aharley committed
Commit 574fdd2 · 1 Parent(s): 09e82bb

updated comments
Files changed (4)
  1. README.md +20 -8
  2. app.py +321 -260
  3. nets/alltracker.py +11 -11
  4. requirements.txt +17 -0
README.md CHANGED
@@ -1,14 +1,26 @@
1
  ---
2
- title: Alltracker
3
- emoji: 📈
4
- colorFrom: indigo
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.35.0
 
 
8
  app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Efficient dense tracking
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: AllTracker
3
+ emoji:
4
+ colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.34.2
8
+ suggested_hardware: a100-large
9
+ suggested_storage: large
10
  app_file: app.py
11
+ pinned: true
12
+ license: cc-by-nc-4.0
 
13
  ---
14
 
15
+ This is a demo for ["AllTracker: Efficient Dense Point Tracking at High Resolution"](https://alltracker.github.io/)
16
+
17
+ Paper page: https://huggingface.co/papers/2506.07310
18
+
19
+ ```
20
+ @inproceedings{harley2025alltracker,
21
+ author = {Adam W. Harley and Yang You and Xinglong Sun and Yang Zheng and Nikhil Raghuraman and Yunqi Gu and Sheldon Liang and Wen-Hsuan Chu and Achal Dave and Pavel Tokmakov and Suya You and Rares Ambrus and Katerina Fragkiadaki and Leonidas J. Guibas},
22
+ title = {All{T}racker: {E}fficient Dense Point Tracking at High Resolution},
23
+ booktitle = {ICCV},
24
+ year = {2025}
25
+ }
26
+ ```
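
For reference, the front matter above names `app.py` as the entry point, pins the Gradio SDK to 5.34.2, and suggests A100 hardware with large storage. A minimal, hypothetical sketch for trying the demo locally, assuming the Space repository is cloned and the pinned dependencies from `requirements.txt` (added in this commit) are installed:

```python
import subprocess
import sys

# Hypothetical local launch of the entry point named in the front matter above.
# Assumes `pip install -r requirements.txt` has already been run in this environment.
subprocess.run([sys.executable, "app.py"], check=True)
```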
app.py CHANGED
@@ -5,7 +5,9 @@ import os
5
  import sys
6
  import uuid
7
  from concurrent.futures import ThreadPoolExecutor
 
8
 
 
9
 
10
  import gradio as gr
11
  import mediapy
@@ -21,6 +23,7 @@ import numpy as np
21
  import utils.basic
22
  import utils.improc
23
 
 
24
 
25
  # Generate random colormaps for visualizing different points.
26
  def get_colors(num_colors: int) -> List[Tuple[int, int, int]]:
@@ -37,63 +40,63 @@ def get_colors(num_colors: int) -> List[Tuple[int, int, int]]:
37
  random.shuffle(colors)
38
  return colors
39
 
40
- def get_points_on_a_grid(
41
- size: int,
42
- extent: Tuple[float, ...],
43
- center: Optional[Tuple[float, ...]] = None,
44
- device: Optional[torch.device] = torch.device("cpu"),
45
- ):
46
- r"""Get a grid of points covering a rectangular region
47
-
48
- `get_points_on_a_grid(size, extent)` generates a :attr:`size` by
49
- :attr:`size` grid fo points distributed to cover a rectangular area
50
- specified by `extent`.
51
-
52
- The `extent` is a pair of integer :math:`(H,W)` specifying the height
53
- and width of the rectangle.
54
-
55
- Optionally, the :attr:`center` can be specified as a pair :math:`(c_y,c_x)`
56
- specifying the vertical and horizontal center coordinates. The center
57
- defaults to the middle of the extent.
58
-
59
- Points are distributed uniformly within the rectangle leaving a margin
60
- :math:`m=W/64` from the border.
61
-
62
- It returns a :math:`(1, \text{size} \times \text{size}, 2)` tensor of
63
- points :math:`P_{ij}=(x_i, y_i)` where
64
-
65
- .. math::
66
- P_{ij} = \left(
67
- c_x + m -\frac{W}{2} + \frac{W - 2m}{\text{size} - 1}\, j,~
68
- c_y + m -\frac{H}{2} + \frac{H - 2m}{\text{size} - 1}\, i
69
- \right)
70
-
71
- Points are returned in row-major order.
72
-
73
- Args:
74
- size (int): grid size.
75
- extent (tuple): height and with of the grid extent.
76
- center (tuple, optional): grid center.
77
- device (str, optional): Defaults to `"cpu"`.
78
-
79
- Returns:
80
- Tensor: grid.
81
- """
82
- if size == 1:
83
- return torch.tensor([extent[1] / 2, extent[0] / 2], device=device)[None, None]
84
-
85
- if center is None:
86
- center = [extent[0] / 2, extent[1] / 2]
87
-
88
- margin = extent[1] / 64
89
- range_y = (margin - extent[0] / 2 + center[0], extent[0] / 2 + center[0] - margin)
90
- range_x = (margin - extent[1] / 2 + center[1], extent[1] / 2 + center[1] - margin)
91
- grid_y, grid_x = torch.meshgrid(
92
- torch.linspace(*range_y, size, device=device),
93
- torch.linspace(*range_x, size, device=device),
94
- indexing="ij",
95
- )
96
- return torch.stack([grid_x, grid_y], dim=-1).reshape(1, -1, 2)
97
 
98
  def paint_point_track_gpu_scatter(
99
  frames: np.ndarray,
@@ -382,105 +385,105 @@ def paint_point_track(
382
  return video
383
 
384
 
385
- PREVIEW_WIDTH = 768 # Width of the preview video
386
- PREVIEW_HEIGHT = 768
387
  # VIDEO_INPUT_RESO = (384, 512) # Resolution of the input video
388
  POINT_SIZE = 1 # Size of the query point in the preview video
389
- FRAME_LIMIT = 300 # Limit the number of frames to process
390
 
391
 
392
- def get_point(frame_num, video_queried_preview, query_points, query_points_color, query_count, evt: gr.SelectData):
393
- print(f"You selected {(evt.index[0], evt.index[1], frame_num)}")
394
 
395
- current_frame = video_queried_preview[int(frame_num)]
396
 
397
- # Get the mouse click
398
- query_points[int(frame_num)].append((evt.index[0], evt.index[1], frame_num))
399
 
400
- # Choose the color for the point from matplotlib colormap
401
- color = matplotlib.colormaps.get_cmap("gist_rainbow")(query_count % 20 / 20)
402
- color = (int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
403
- # print(f"Color: {color}")
404
- query_points_color[int(frame_num)].append(color)
405
 
406
- # Draw the point on the frame
407
- x, y = evt.index
408
- current_frame_draw = cv2.circle(current_frame, (x, y), POINT_SIZE, color, -1)
409
 
410
- # Update the frame
411
- video_queried_preview[int(frame_num)] = current_frame_draw
412
 
413
- # Update the query count
414
- query_count += 1
415
- return (
416
- current_frame_draw, # Updated frame for preview
417
- video_queried_preview, # Updated preview video
418
- query_points, # Updated query points
419
- query_points_color, # Updated query points color
420
- query_count # Updated query count
421
- )
422
 
423
 
424
- def undo_point(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
425
- if len(query_points[int(frame_num)]) == 0:
426
- return (
427
- video_queried_preview[int(frame_num)],
428
- video_queried_preview,
429
- query_points,
430
- query_points_color,
431
- query_count
432
- )
433
 
434
- # Get the last point
435
- query_points[int(frame_num)].pop(-1)
436
- query_points_color[int(frame_num)].pop(-1)
437
 
438
- # Redraw the frame
439
- current_frame_draw = video_preview[int(frame_num)].copy()
440
- for point, color in zip(query_points[int(frame_num)], query_points_color[int(frame_num)]):
441
- x, y, _ = point
442
- current_frame_draw = cv2.circle(current_frame_draw, (x, y), POINT_SIZE, color, -1)
443
 
444
- # Update the query count
445
- query_count -= 1
446
 
447
- # Update the frame
448
- video_queried_preview[int(frame_num)] = current_frame_draw
449
- return (
450
- current_frame_draw, # Updated frame for preview
451
- video_queried_preview, # Updated preview video
452
- query_points, # Updated query points
453
- query_points_color, # Updated query points color
454
- query_count # Updated query count
455
- )
456
 
457
 
458
- def clear_frame_fn(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
459
- query_count -= len(query_points[int(frame_num)])
460
 
461
- query_points[int(frame_num)] = []
462
- query_points_color[int(frame_num)] = []
463
 
464
- video_queried_preview[int(frame_num)] = video_preview[int(frame_num)].copy()
465
 
466
- return (
467
- video_preview[int(frame_num)], # Set the preview frame to the original frame
468
- video_queried_preview,
469
- query_points, # Cleared query points
470
- query_points_color, # Cleared query points color
471
- query_count # New query count
472
- )
473
 
474
 
475
 
476
- def clear_all_fn(frame_num, video_preview):
477
- return (
478
- video_preview[int(frame_num)],
479
- video_preview.copy(),
480
- [[] for _ in range(len(video_preview))],
481
- [[] for _ in range(len(video_preview))],
482
- 0
483
- )
484
 
485
 
486
  def choose_frame(frame_num, video_preview_array):
@@ -502,6 +505,11 @@ def preprocess_video_input(video_path):
502
  new_height, new_width = PREVIEW_HEIGHT, int(PREVIEW_WIDTH * width / height)
503
  else:
504
  new_height, new_width = int(PREVIEW_WIDTH * height / width), PREVIEW_WIDTH
505
  preview_video = mediapy.resize_video(video_arr, (new_height, new_width))
506
  # input_video = mediapy.resize_video(video_arr, VIDEO_INPUT_RESO)
507
  # input_video = video_arr
@@ -519,7 +527,7 @@ def preprocess_video_input(video_path):
519
  input_video, # Resized video input for model
520
  # None, # video_feature, # Extracted feature
521
  video_fps, # Set the video FPS
522
- gr.update(open=False), # Close the video input drawer
523
  # tracking_mode, # Set the tracking mode
524
  preview_video[0], # Set the preview frame to the first frame
525
  gr.update(minimum=0, maximum=num_frames - 1, value=0, interactive=interactive), # Set slider interactive
@@ -624,20 +632,47 @@ def track(
624
  torch.cuda.empty_cache()
625
 
626
  with torch.no_grad():
627
- # model.forward_sliding(
628
- flows_e, visconf_maps_e, _, _ = \
629
- model.forward_sliding(video_input[:, query_frame:], iters=4, sw=None, is_training=False)
630
- traj_maps_e = flows_e + grid_xy # B,Tf,2,H,W
631
- print("5 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
632
-
 
 
 
633
  if query_frame > 0:
634
  backward_flows_e, backward_visconf_maps_e, _, _ = \
635
- model.forward_sliding(video_input[:, :query_frame+1].flip([1]), iters=4, sw=None, is_training=False)
636
- backward_traj_maps_e = backward_flows_e + grid_xy # B,Tb,2,H,W, reversed
637
- backward_traj_maps_e = backward_traj_maps_e.flip([1])[:, :-1] # flip time and drop the overlapped frame
638
- backward_visconf_maps_e = backward_visconf_maps_e.flip([1])[:, :-1] # flip time and drop the overlapped frame
639
  traj_maps_e = torch.cat([backward_traj_maps_e, traj_maps_e], dim=1) # B,T,2,H,W
640
  visconf_maps_e = torch.cat([backward_visconf_maps_e, visconf_maps_e], dim=1) # B,T,2,H,W
641
  print("6 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
642
 
643
  # for ind in range(0, video_input.shape[1] - model.step, model.step):
@@ -665,7 +700,8 @@ def track(
665
  visibs = visconf_maps_e.permute(0,3,4,1,2).reshape(-1,T,2)[:,:,0].numpy()
666
  confs = visconf_maps_e.permute(0,3,4,1,2).reshape(-1,T,2)[:,:,0].numpy()
667
 
668
- visibs = (visibs * confs) > 0.9 # N,T
 
669
 
670
 
671
  # sc = (np.array([video_preview.shape[2], video_preview.shape[1]]) / np.array([VIDEO_INPUT_RESO[1], VIDEO_INPUT_RESO[0]])).reshape(1,1,2)
@@ -700,9 +736,31 @@ def track(
700
  video_file_name = uuid.uuid4().hex + ".mp4"
701
  video_path = os.path.join(os.path.dirname(__file__), "tmp")
702
  video_file_path = os.path.join(video_path, video_file_name)
703
- os.makedirs(video_path, exist_ok=True)
704
 
705
- mediapy.write_video(video_file_path, painted_video, fps=video_fps)
706
 
707
  return video_file_path
708
 
@@ -719,55 +777,58 @@ with gr.Blocks() as demo:
719
  is_tracked_query = gr.State([])
720
  query_count = gr.State(0)
721
 
722
- gr.Markdown("# 🎨 CoTracker3: Simpler and Better Point Tracking by Pseudo-Labelling Real Videos")
723
  gr.Markdown("<div style='text-align: left;'> \
724
- <p>Welcome to <a href='https://cotracker3.github.io/' target='_blank'>CoTracker</a>! This space demonstrates point (pixel) tracking in videos. \
725
- The model tracks points on a grid or points selected by you. </p> \
726
- <p> To get started, simply upload your <b>.mp4</b> video or click on one of the example videos to load them. The shorter the video, the faster the processing. We recommend submitting short videos of length <b>2-7 seconds</b>.</p> \
727
- <p> After you uploaded a video, please click \"Submit\" and then click \"Track\" for grid tracking or specify points you want to track before clicking. Enjoy the results! </p>\
728
- <p style='text-align: left'>For more details, check out our <a href='https://github.com/facebookresearch/co-tracker' target='_blank'>GitHub Repo</a> ⭐. We thank the authors of LocoTrack for their interactive demo.</p> \
 
729
  </div>"
730
  )
731
 
732
 
733
- gr.Markdown("## First step: upload your video or select an example video, and click submit.")
734
  with gr.Row():
735
-
736
-
737
- with gr.Accordion("Your video input", open=True) as video_in_drawer:
738
- video_in = gr.Video(label="Video Input", format="mp4")
739
- submit = gr.Button("Submit", scale=0)
740
-
741
- import os
742
- apple = os.path.join(os.path.dirname(__file__), "videos", "apple.mp4")
743
- bear = os.path.join(os.path.dirname(__file__), "videos", "bear.mp4")
744
- paragliding_launch = os.path.join(
745
- os.path.dirname(__file__), "videos", "paragliding-launch.mp4"
746
- )
747
- paragliding = os.path.join(os.path.dirname(__file__), "videos", "paragliding.mp4")
748
- cat = os.path.join(os.path.dirname(__file__), "videos", "cat.mp4")
749
- pillow = os.path.join(os.path.dirname(__file__), "videos", "pillow.mp4")
750
- teddy = os.path.join(os.path.dirname(__file__), "videos", "teddy.mp4")
751
- backpack = os.path.join(os.path.dirname(__file__), "videos", "backpack.mp4")
752
-
753
-
754
- gr.Examples(examples=[bear, apple, paragliding, paragliding_launch, cat, pillow, teddy, backpack],
755
- inputs = [
756
- video_in
757
- ],
758
- )
759
-
 
 
760
 
761
- gr.Markdown("## Second step: Simply click \"Track\" to track a grid of points or select query points on the video before clicking")
762
  with gr.Row():
763
  with gr.Column():
764
  with gr.Row():
765
  query_frames = gr.Slider(
766
  minimum=0, maximum=100, value=0, step=1, label="Choose Frame", interactive=False)
767
- with gr.Row():
768
- undo = gr.Button("Undo", interactive=False)
769
- clear_frame = gr.Button("Clear Frame", interactive=False)
770
- clear_all = gr.Button("Clear All", interactive=False)
771
 
772
  with gr.Row():
773
  current_frame = gr.Image(
@@ -799,16 +860,16 @@ with gr.Blocks() as demo:
799
  video_queried_preview,
800
  video_input,
801
  video_fps,
802
- video_in_drawer,
803
  current_frame,
804
  query_frames,
805
  query_points,
806
  query_points_color,
807
  is_tracked_query,
808
  query_count,
809
- undo,
810
- clear_frame,
811
- clear_all,
812
  track_button,
813
  ],
814
  queue = False
@@ -823,80 +884,80 @@ with gr.Blocks() as demo:
823
  queue = False
824
  )
825
 
826
- current_frame.select(
827
- fn = get_point,
828
- inputs = [
829
- query_frames,
830
- video_queried_preview,
831
- query_points,
832
- query_points_color,
833
- query_count,
834
- ],
835
- outputs = [
836
- current_frame,
837
- video_queried_preview,
838
- query_points,
839
- query_points_color,
840
- query_count
841
- ],
842
- queue = False
843
- )
844
 
845
- undo.click(
846
- fn = undo_point,
847
- inputs = [
848
- query_frames,
849
- video_preview,
850
- video_queried_preview,
851
- query_points,
852
- query_points_color,
853
- query_count
854
- ],
855
- outputs = [
856
- current_frame,
857
- video_queried_preview,
858
- query_points,
859
- query_points_color,
860
- query_count
861
- ],
862
- queue = False
863
- )
864
-
865
- clear_frame.click(
866
- fn = clear_frame_fn,
867
- inputs = [
868
- query_frames,
869
- video_preview,
870
- video_queried_preview,
871
- query_points,
872
- query_points_color,
873
- query_count
874
- ],
875
- outputs = [
876
- current_frame,
877
- video_queried_preview,
878
- query_points,
879
- query_points_color,
880
- query_count
881
- ],
882
- queue = False
883
- )
884
-
885
- clear_all.click(
886
- fn = clear_all_fn,
887
- inputs = [
888
- query_frames,
889
- video_preview,
890
- ],
891
- outputs = [
892
- current_frame,
893
- video_queried_preview,
894
- query_points,
895
- query_points_color,
896
- query_count
897
- ],
898
- queue = False
899
- )
900
 
901
 
902
  track_button.click(
 
5
  import sys
6
  import uuid
7
  from concurrent.futures import ThreadPoolExecutor
8
+ import subprocess
9
 
10
+ from nets.blocks import InputPadder
11
 
12
  import gradio as gr
13
  import mediapy
 
23
  import utils.basic
24
  import utils.improc
25
 
26
+ import PIL.Image
27
 
28
  # Generate random colormaps for visualizing different points.
29
  def get_colors(num_colors: int) -> List[Tuple[int, int, int]]:
 
40
  random.shuffle(colors)
41
  return colors
42
 
43
+ # def get_points_on_a_grid(
44
+ # size: int,
45
+ # extent: Tuple[float, ...],
46
+ # center: Optional[Tuple[float, ...]] = None,
47
+ # device: Optional[torch.device] = torch.device("cpu"),
48
+ # ):
49
+ # r"""Get a grid of points covering a rectangular region
50
+
51
+ # `get_points_on_a_grid(size, extent)` generates a :attr:`size` by
52
+ # :attr:`size` grid fo points distributed to cover a rectangular area
53
+ # specified by `extent`.
54
+
55
+ # The `extent` is a pair of integer :math:`(H,W)` specifying the height
56
+ # and width of the rectangle.
57
+
58
+ # Optionally, the :attr:`center` can be specified as a pair :math:`(c_y,c_x)`
59
+ # specifying the vertical and horizontal center coordinates. The center
60
+ # defaults to the middle of the extent.
61
+
62
+ # Points are distributed uniformly within the rectangle leaving a margin
63
+ # :math:`m=W/64` from the border.
64
+
65
+ # It returns a :math:`(1, \text{size} \times \text{size}, 2)` tensor of
66
+ # points :math:`P_{ij}=(x_i, y_i)` where
67
+
68
+ # .. math::
69
+ # P_{ij} = \left(
70
+ # c_x + m -\frac{W}{2} + \frac{W - 2m}{\text{size} - 1}\, j,~
71
+ # c_y + m -\frac{H}{2} + \frac{H - 2m}{\text{size} - 1}\, i
72
+ # \right)
73
+
74
+ # Points are returned in row-major order.
75
+
76
+ # Args:
77
+ # size (int): grid size.
78
+ # extent (tuple): height and with of the grid extent.
79
+ # center (tuple, optional): grid center.
80
+ # device (str, optional): Defaults to `"cpu"`.
81
+
82
+ # Returns:
83
+ # Tensor: grid.
84
+ # """
85
+ # if size == 1:
86
+ # return torch.tensor([extent[1] / 2, extent[0] / 2], device=device)[None, None]
87
+
88
+ # if center is None:
89
+ # center = [extent[0] / 2, extent[1] / 2]
90
+
91
+ # margin = extent[1] / 64
92
+ # range_y = (margin - extent[0] / 2 + center[0], extent[0] / 2 + center[0] - margin)
93
+ # range_x = (margin - extent[1] / 2 + center[1], extent[1] / 2 + center[1] - margin)
94
+ # grid_y, grid_x = torch.meshgrid(
95
+ # torch.linspace(*range_y, size, device=device),
96
+ # torch.linspace(*range_x, size, device=device),
97
+ # indexing="ij",
98
+ # )
99
+ # return torch.stack([grid_x, grid_y], dim=-1).reshape(1, -1, 2)
100
 
101
  def paint_point_track_gpu_scatter(
102
  frames: np.ndarray,
 
385
  return video
386
 
387
 
388
+ PREVIEW_WIDTH = 1024 # Width of the preview video
389
+ PREVIEW_HEIGHT = 1024
390
  # VIDEO_INPUT_RESO = (384, 512) # Resolution of the input video
391
  POINT_SIZE = 1 # Size of the query point in the preview video
392
+ FRAME_LIMIT = 600 # Limit the number of frames to process
393
 
394
 
395
+ # def get_point(frame_num, video_queried_preview, query_points, query_points_color, query_count, evt: gr.SelectData):
396
+ # print(f"You selected {(evt.index[0], evt.index[1], frame_num)}")
397
 
398
+ # current_frame = video_queried_preview[int(frame_num)]
399
 
400
+ # # Get the mouse click
401
+ # query_points[int(frame_num)].append((evt.index[0], evt.index[1], frame_num))
402
 
403
+ # # Choose the color for the point from matplotlib colormap
404
+ # color = matplotlib.colormaps.get_cmap("gist_rainbow")(query_count % 20 / 20)
405
+ # color = (int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
406
+ # # print(f"Color: {color}")
407
+ # query_points_color[int(frame_num)].append(color)
408
 
409
+ # # Draw the point on the frame
410
+ # x, y = evt.index
411
+ # current_frame_draw = cv2.circle(current_frame, (x, y), POINT_SIZE, color, -1)
412
 
413
+ # # Update the frame
414
+ # video_queried_preview[int(frame_num)] = current_frame_draw
415
 
416
+ # # Update the query count
417
+ # query_count += 1
418
+ # return (
419
+ # current_frame_draw, # Updated frame for preview
420
+ # video_queried_preview, # Updated preview video
421
+ # query_points, # Updated query points
422
+ # query_points_color, # Updated query points color
423
+ # query_count # Updated query count
424
+ # )
425
 
426
 
427
+ # def undo_point(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
428
+ # if len(query_points[int(frame_num)]) == 0:
429
+ # return (
430
+ # video_queried_preview[int(frame_num)],
431
+ # video_queried_preview,
432
+ # query_points,
433
+ # query_points_color,
434
+ # query_count
435
+ # )
436
 
437
+ # # Get the last point
438
+ # query_points[int(frame_num)].pop(-1)
439
+ # query_points_color[int(frame_num)].pop(-1)
440
 
441
+ # # Redraw the frame
442
+ # current_frame_draw = video_preview[int(frame_num)].copy()
443
+ # for point, color in zip(query_points[int(frame_num)], query_points_color[int(frame_num)]):
444
+ # x, y, _ = point
445
+ # current_frame_draw = cv2.circle(current_frame_draw, (x, y), POINT_SIZE, color, -1)
446
 
447
+ # # Update the query count
448
+ # query_count -= 1
449
 
450
+ # # Update the frame
451
+ # video_queried_preview[int(frame_num)] = current_frame_draw
452
+ # return (
453
+ # current_frame_draw, # Updated frame for preview
454
+ # video_queried_preview, # Updated preview video
455
+ # query_points, # Updated query points
456
+ # query_points_color, # Updated query points color
457
+ # query_count # Updated query count
458
+ # )
459
 
460
 
461
+ # def clear_frame_fn(frame_num, video_preview, video_queried_preview, query_points, query_points_color, query_count):
462
+ # query_count -= len(query_points[int(frame_num)])
463
 
464
+ # query_points[int(frame_num)] = []
465
+ # query_points_color[int(frame_num)] = []
466
 
467
+ # video_queried_preview[int(frame_num)] = video_preview[int(frame_num)].copy()
468
 
469
+ # return (
470
+ # video_preview[int(frame_num)], # Set the preview frame to the original frame
471
+ # video_queried_preview,
472
+ # query_points, # Cleared query points
473
+ # query_points_color, # Cleared query points color
474
+ # query_count # New query count
475
+ # )
476
 
477
 
478
 
479
+ # def clear_all_fn(frame_num, video_preview):
480
+ # return (
481
+ # video_preview[int(frame_num)],
482
+ # video_preview.copy(),
483
+ # [[] for _ in range(len(video_preview))],
484
+ # [[] for _ in range(len(video_preview))],
485
+ # 0
486
+ # )
487
 
488
 
489
  def choose_frame(frame_num, video_preview_array):
 
505
  new_height, new_width = PREVIEW_HEIGHT, int(PREVIEW_WIDTH * width / height)
506
  else:
507
  new_height, new_width = int(PREVIEW_WIDTH * height / width), PREVIEW_WIDTH
508
+ if height*width > 768*768:
509
+ new_height = new_height*3//4
510
+ new_width = new_width*3//4
511
+
512
+
513
  preview_video = mediapy.resize_video(video_arr, (new_height, new_width))
514
  # input_video = mediapy.resize_video(video_arr, VIDEO_INPUT_RESO)
515
  # input_video = video_arr
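
To make the preview sizing above concrete, here is a small worked example using the landscape branch shown in the hunk, with an assumed 1920×1080 input (numbers are illustrative only):

```python
# Worked example of the preview sizing above for an assumed 1920x1080 landscape input.
PREVIEW_WIDTH = 1024
width, height = 1920, 1080

# Landscape branch shown above: cap the long side at PREVIEW_WIDTH.
new_height, new_width = int(PREVIEW_WIDTH * height / width), PREVIEW_WIDTH   # 576, 1024

# Large inputs (area above 768*768) are shrunk a further 3/4 to keep the preview light.
if height * width > 768 * 768:
    new_height = new_height * 3 // 4   # 432
    new_width = new_width * 3 // 4     # 768

print(new_height, new_width)           # 432 768
```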
 
527
  input_video, # Resized video input for model
528
  # None, # video_feature, # Extracted feature
529
  video_fps, # Set the video FPS
530
+ # gr.update(open=True), # open/close the video input drawer
531
  # tracking_mode, # Set the tracking mode
532
  preview_video[0], # Set the preview frame to the first frame
533
  gr.update(minimum=0, maximum=num_frames - 1, value=0, interactive=interactive), # Set slider interactive
 
632
  torch.cuda.empty_cache()
633
 
634
  with torch.no_grad():
635
+ utils.basic.print_stats('video_input', video_input)
636
+ if query_frame < T-1:
637
+ flows_e, visconf_maps_e, _, _ = \
638
+ model(video_input[:, query_frame:], iters=4, sw=None, is_training=False)
639
+ traj_maps_e = flows_e.cpu() + grid_xy # B,Tf,2,H,W
640
+ visconf_maps_e = visconf_maps_e.cpu()
641
+ else:
642
+ traj_maps_e = torch.zeros((1,0,2,H,W), dtype=torch.float32)
643
+ visconf_maps_e = torch.zeros((1,0,2,H,W), dtype=torch.float32)
644
  if query_frame > 0:
645
  backward_flows_e, backward_visconf_maps_e, _, _ = \
646
+ model(video_input[:, :query_frame+1].flip([1]), iters=4, sw=None, is_training=False)
647
+ backward_traj_maps_e = backward_flows_e.cpu() + grid_xy # B,Tb,2,H,W, reversed
648
+ backward_visconf_maps_e = backward_visconf_maps_e.cpu()
649
+ backward_traj_maps_e = backward_traj_maps_e.flip([1]) # flip time
650
+ backward_visconf_maps_e = backward_visconf_maps_e.flip([1]) # flip time
651
+ if query_frame < T-1:
652
+ backward_traj_maps_e = backward_traj_maps_e[:, :-1] # drop the overlapped frame
653
+ backward_visconf_maps_e = backward_visconf_maps_e[:, :-1] # drop the overlapped frame
654
  traj_maps_e = torch.cat([backward_traj_maps_e, traj_maps_e], dim=1) # B,T,2,H,W
655
  visconf_maps_e = torch.cat([backward_visconf_maps_e, visconf_maps_e], dim=1) # B,T,2,H,W
656
+ # if query_frame < T-1:
657
+ # flows_e, visconf_maps_e, _, _ = \
658
+ # model.forward_sliding(video_input[:, query_frame:], iters=4, sw=None, is_training=False)
659
+ # traj_maps_e = flows_e + grid_xy # B,Tf,2,H,W
660
+ # print("5 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
661
+ # else:
662
+ # traj_maps_e = torch.zeros((1,0,2,H,W), dtype=torch.float32)
663
+ # visconf_maps_e = torch.zeros((1,0,2,H,W), dtype=torch.float32)
664
+
665
+ # if query_frame > 0:
666
+ # backward_flows_e, backward_visconf_maps_e, _, _ = \
667
+ # model.forward_sliding(video_input[:, :query_frame+1].flip([1]), iters=4, sw=None, is_training=False)
668
+ # backward_traj_maps_e = backward_flows_e + grid_xy # B,Tb,2,H,W, reversed
669
+ # backward_traj_maps_e = backward_traj_maps_e.flip([1]) # flip time
670
+ # backward_visconf_maps_e = backward_visconf_maps_e.flip([1]) # flip time
671
+ # if query_frame < T-1:
672
+ # backward_traj_maps_e = backward_traj_maps_e[:, :-1] # drop the overlapped frame
673
+ # backward_visconf_maps_e = backward_visconf_maps_e[:, :-1] # drop the overlapped frame
674
+ # traj_maps_e = torch.cat([backward_traj_maps_e, traj_maps_e], dim=1) # B,T,2,H,W
675
+ # visconf_maps_e = torch.cat([backward_visconf_maps_e, visconf_maps_e], dim=1) # B,T,2,H,W
676
  print("6 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
677
 
678
  # for ind in range(0, video_input.shape[1] - model.step, model.step):
 
700
  visibs = visconf_maps_e.permute(0,3,4,1,2).reshape(-1,T,2)[:,:,0].numpy()
701
  confs = visconf_maps_e.permute(0,3,4,1,2).reshape(-1,T,2)[:,:,0].numpy()
702
 
703
+ visibs = (visibs * confs) > 0.2 # N,T
704
+ # visibs = (confs) > 0.1 # N,T
705
 
706
 
707
  # sc = (np.array([video_preview.shape[2], video_preview.shape[1]]) / np.array([VIDEO_INPUT_RESO[1], VIDEO_INPUT_RESO[0]])).reshape(1,1,2)
 
736
  video_file_name = uuid.uuid4().hex + ".mp4"
737
  video_path = os.path.join(os.path.dirname(__file__), "tmp")
738
  video_file_path = os.path.join(video_path, video_file_name)
 
739
 
740
+ os.makedirs(video_path, exist_ok=True)
741
+ if False:
742
+ mediapy.write_video(video_file_path, painted_video, fps=video_fps)
743
+ else:
744
+ for ti in range(T):
745
+ temp_out_f = '%s/%03d.jpg' % (video_path, ti)
746
+ # temp_out_f = '%s/%03d.png' % (video_path, ti)
747
+ im = PIL.Image.fromarray(painted_video[ti])
748
+ # im.save(temp_out_f, "PNG", subsampling=0, quality=80)
749
+ im.save(temp_out_f)
750
+ print('saved', temp_out_f)
751
+ # os.system('/usr/bin/ffmpeg -y -hide_banner -loglevel error -f image2 -framerate %d -pattern_type glob -i "%s/*.png" -c:v libx264 -crf 20 -pix_fmt yuv420p %s' % (video_fps, video_path, video_file_path))
752
+ os.system('/usr/bin/ffmpeg -y -hide_banner -loglevel error -f image2 -framerate %d -pattern_type glob -i "%s/*.jpg" -c:v libx264 -crf 20 -pix_fmt yuv420p %s' % (video_fps, video_path, video_file_path))
753
+ print('saved', video_file_path)
754
+ for ti in range(T):
755
+ # temp_out_f = '%s/%03d.png' % (video_path, ti)
756
+ temp_out_f = '%s/%03d.jpg' % (video_path, ti)
757
+ os.remove(temp_out_f)
758
+ print('deleted', temp_out_f)
759
+
760
+ # out_file = tempfile.NamedTemporaryFile(suffix="out.mp4", delete=False)
761
+ # subprocess.run(f"ffmpeg -y -loglevel quiet -stats -i {painted_video} -c:v libx264 {out_file.name}".split())
762
+
763
+
764
 
765
  return video_file_path
766
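
A note on the encoding step above: the frames are written as JPEGs and assembled with ffmpeg (libx264, CRF 20, yuv420p for broad player compatibility). Since the commit also imports `subprocess` and leaves a commented-out `subprocess.run` variant, a hypothetical equivalent of the same `os.system` call without shell string formatting would look roughly like this (paths and fps are illustrative):

```python
import subprocess

# Hypothetical subprocess.run equivalent of the os.system ffmpeg call above.
video_path = "tmp"               # directory holding the numbered .jpg frames
video_file_path = "tmp/out.mp4"  # output file
video_fps = 24                   # frame rate of the source video

cmd = [
    "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
    "-f", "image2", "-framerate", str(video_fps),
    "-pattern_type", "glob", "-i", f"{video_path}/*.jpg",
    "-c:v", "libx264", "-crf", "20",      # H.264 at high quality
    "-pix_fmt", "yuv420p",                # broad player compatibility
    video_file_path,
]
subprocess.run(cmd, check=True)
```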
 
 
777
  is_tracked_query = gr.State([])
778
  query_count = gr.State(0)
779
 
780
+ gr.Markdown("# AllTracker: Efficient Dense Point Tracking at High Resolution")
781
  gr.Markdown("<div style='text-align: left;'> \
782
+ <p>Welcome to <a href='https://alltracker.github.io/' target='_blank'>AllTracker</a>! This space demonstrates point (pixel) tracking in videos. \
783
+ The model tracks all pixels in a frame that you select. </p> \
784
+ <p>To get started, simply upload your <b>.mp4</b> video, or click on one of the example videos. The shorter the video, the faster the processing. We recommend submitting videos under 20 seconds long.</p> \
785
+ <p>After picking a video, click \"Submit\" to load the frames into the app, and optionally choose a frame (using the slider), and then click \"Track\".</p> \
786
+ <p>For full info on how this works, check out our <a href='https://github.com/aharley/alltracker/' target='_blank'>GitHub Repo</a>!</p> \
787
+ <p>Initial code for this Gradio app came from LocoTrack and CoTracker.</p> \
788
  </div>"
789
  )
790
 
791
 
792
+ gr.Markdown("## Step 1: Select a video, and click \"Submit\".")
793
  with gr.Row():
794
+ with gr.Column():
795
+ with gr.Row():
796
+ video_in = gr.Video(label="Video Input", format="mp4")
797
+ with gr.Row():
798
+ submit = gr.Button("Submit")
799
+ with gr.Column():
800
+ # with gr.Accordion("Sample videos", open=True) as video_in_drawer:
801
+ with gr.Row():
802
+ dog = os.path.join(os.path.dirname(__file__), "videos", "dog.mp4")
803
+ monkey = os.path.join(os.path.dirname(__file__), "videos", "monkey_28.mp4")
804
+ apple = os.path.join(os.path.dirname(__file__), "videos", "apple.mp4")
805
+ bear = os.path.join(os.path.dirname(__file__), "videos", "bear.mp4")
806
+ paragliding_launch = os.path.join(
807
+ os.path.dirname(__file__), "videos", "paragliding-launch.mp4"
808
+ )
809
+ paragliding = os.path.join(os.path.dirname(__file__), "videos", "paragliding.mp4")
810
+ cat = os.path.join(os.path.dirname(__file__), "videos", "cat.mp4")
811
+ pillow = os.path.join(os.path.dirname(__file__), "videos", "pillow.mp4")
812
+ teddy = os.path.join(os.path.dirname(__file__), "videos", "teddy.mp4")
813
+ backpack = os.path.join(os.path.dirname(__file__), "videos", "backpack.mp4")
814
+ gr.Examples(examples=[dog, monkey, bear, apple, paragliding, paragliding_launch, cat, pillow, teddy, backpack],
815
+ inputs = [
816
+ video_in
817
+ ],
818
+ )
819
+ # with gr.Column():
820
+ # gr.Markdown("Choose a video or upload one of your own.")
821
 
822
+ gr.Markdown("## Step 2: Select a frame, and click \"Track\"")
823
  with gr.Row():
824
  with gr.Column():
825
  with gr.Row():
826
  query_frames = gr.Slider(
827
  minimum=0, maximum=100, value=0, step=1, label="Choose Frame", interactive=False)
828
+ # with gr.Row():
829
+ # undo = gr.Button("Undo", interactive=False)
830
+ # clear_frame = gr.Button("Clear Frame", interactive=False)
831
+ # clear_all = gr.Button("Clear All", interactive=False)
832
 
833
  with gr.Row():
834
  current_frame = gr.Image(
 
860
  video_queried_preview,
861
  video_input,
862
  video_fps,
863
+ # video_in_drawer,
864
  current_frame,
865
  query_frames,
866
  query_points,
867
  query_points_color,
868
  is_tracked_query,
869
  query_count,
870
+ # undo,
871
+ # clear_frame,
872
+ # clear_all,
873
  track_button,
874
  ],
875
  queue = False
 
884
  queue = False
885
  )
886
 
887
+ # current_frame.select(
888
+ # fn = get_point,
889
+ # inputs = [
890
+ # query_frames,
891
+ # video_queried_preview,
892
+ # query_points,
893
+ # query_points_color,
894
+ # query_count,
895
+ # ],
896
+ # outputs = [
897
+ # current_frame,
898
+ # video_queried_preview,
899
+ # query_points,
900
+ # query_points_color,
901
+ # query_count
902
+ # ],
903
+ # queue = False
904
+ # )
905
 
906
+ # undo.click(
907
+ # fn = undo_point,
908
+ # inputs = [
909
+ # query_frames,
910
+ # video_preview,
911
+ # video_queried_preview,
912
+ # query_points,
913
+ # query_points_color,
914
+ # query_count
915
+ # ],
916
+ # outputs = [
917
+ # current_frame,
918
+ # video_queried_preview,
919
+ # query_points,
920
+ # query_points_color,
921
+ # query_count
922
+ # ],
923
+ # queue = False
924
+ # )
925
+
926
+ # clear_frame.click(
927
+ # fn = clear_frame_fn,
928
+ # inputs = [
929
+ # query_frames,
930
+ # video_preview,
931
+ # video_queried_preview,
932
+ # query_points,
933
+ # query_points_color,
934
+ # query_count
935
+ # ],
936
+ # outputs = [
937
+ # current_frame,
938
+ # video_queried_preview,
939
+ # query_points,
940
+ # query_points_color,
941
+ # query_count
942
+ # ],
943
+ # queue = False
944
+ # )
945
+
946
+ # clear_all.click(
947
+ # fn = clear_all_fn,
948
+ # inputs = [
949
+ # query_frames,
950
+ # video_preview,
951
+ # ],
952
+ # outputs = [
953
+ # current_frame,
954
+ # video_queried_preview,
955
+ # query_points,
956
+ # query_points_color,
957
+ # query_count
958
+ # ],
959
+ # queue = False
960
+ # )
961
 
962
 
963
  track_button.click(
nets/alltracker.py CHANGED
@@ -236,7 +236,7 @@ class Net(nn.Module):
236
  std = torch.as_tensor([0.229, 0.224, 0.225], device=device).reshape(1,1,3,1,1).to(images.dtype)
237
  images = images / 255.0
238
  images = (images - mean)/std
239
- print("a0 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
240
 
241
  T_bak = T
242
  if stride is not None:
@@ -250,7 +250,7 @@ class Net(nn.Module):
250
  padder = InputPadder(images_.shape)
251
  images_ = padder.pad(images_)[0]
252
 
253
- print("a1 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
254
 
255
  _, _, H_pad, W_pad = images_.shape # revised HW
256
  C, H8, W8 = self.dim*2, H_pad//8, W_pad//8
@@ -261,7 +261,7 @@ class Net(nn.Module):
261
 
262
  fmaps = self.get_fmaps(images_, B, T, sw, is_training).reshape(B,T,C,H8,W8)
263
  device = fmaps.device
264
- print("a2 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
265
 
266
  fmap_anchor = fmaps[:,0]
267
 
@@ -285,11 +285,11 @@ class Net(nn.Module):
285
  if self.use_feats8:
286
  full_feats8 = torch.zeros((B,T,C2,H_pad//8,W_pad//8), dtype=dtype, device=device)
287
  visits = np.zeros((T))
288
- print("a3 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
289
 
290
  for ii, ind in enumerate(indices):
291
  ara = np.arange(ind,ind+S)
292
- print('ara', ara)
293
  if ii < len(indices)-1:
294
  next_ind = indices[ii+1]
295
  next_ara = np.arange(next_ind,next_ind+S)
@@ -306,12 +306,12 @@ class Net(nn.Module):
306
  feats8 = full_feats8[:,ara].reshape(B*(S),C2,H_pad//8,W_pad//8).detach()
307
  else:
308
  feats8 = None
309
- print("a4 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
310
 
311
  flow_predictions, visconf_predictions, flows8, visconfs8, feats8 = self.forward_window(
312
  fmap_anchor, fmaps2, visconfs8, iters=iters, flowfeat=feats8, flows8=flows8,
313
  is_training=is_training)
314
- print("a5 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
315
 
316
  unpad_flow_predictions = []
317
  unpad_visconf_predictions = []
@@ -320,7 +320,7 @@ class Net(nn.Module):
320
  unpad_flow_predictions.append(flow_predictions[i].reshape(B,S,2,H,W))
321
  visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
322
  unpad_visconf_predictions.append(visconf_predictions[i].reshape(B,S,2,H,W))
323
- print("a6 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
324
 
325
  full_flows[:,ara] = unpad_flow_predictions[-1].reshape(B,S,2,H,W)
326
  full_flows8[:,ara] = flows8.reshape(B,S,2,H_pad//8,W_pad//8)
@@ -329,7 +329,7 @@ class Net(nn.Module):
329
  if self.use_feats8:
330
  full_feats8[:,ara] = feats8.reshape(B,S,C2,H_pad//8,W_pad//8)
331
  visits[ara] += 1
332
- print("a7 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
333
 
334
  if is_training:
335
  all_flow_preds.append(unpad_flow_predictions)
@@ -348,7 +348,7 @@ class Net(nn.Module):
348
  full_visconfs8[:,idx] = full_visconfs8[:,nearest]
349
  if self.use_feats8:
350
  full_feats8[:,idx] = full_feats8[:,nearest]
351
- print("a8 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
352
  else: # flow
353
 
354
  flows8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
@@ -370,7 +370,7 @@ class Net(nn.Module):
370
  if (not is_training) and (T > 2):
371
  full_flows = full_flows[:,:T_bak]
372
  full_visconfs = full_visconfs[:,:T_bak]
373
- print("a9 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
374
 
375
  return full_flows, full_visconfs, all_flow_preds, all_visconf_preds
376
 
 
236
  std = torch.as_tensor([0.229, 0.224, 0.225], device=device).reshape(1,1,3,1,1).to(images.dtype)
237
  images = images / 255.0
238
  images = (images - mean)/std
239
+ # print("a0 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
240
 
241
  T_bak = T
242
  if stride is not None:
 
250
  padder = InputPadder(images_.shape)
251
  images_ = padder.pad(images_)[0]
252
 
253
+ # print("a1 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
254
 
255
  _, _, H_pad, W_pad = images_.shape # revised HW
256
  C, H8, W8 = self.dim*2, H_pad//8, W_pad//8
 
261
 
262
  fmaps = self.get_fmaps(images_, B, T, sw, is_training).reshape(B,T,C,H8,W8)
263
  device = fmaps.device
264
+ # print("a2 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
265
 
266
  fmap_anchor = fmaps[:,0]
267
 
 
285
  if self.use_feats8:
286
  full_feats8 = torch.zeros((B,T,C2,H_pad//8,W_pad//8), dtype=dtype, device=device)
287
  visits = np.zeros((T))
288
+ # print("a3 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
289
 
290
  for ii, ind in enumerate(indices):
291
  ara = np.arange(ind,ind+S)
292
+ # print('ara', ara)
293
  if ii < len(indices)-1:
294
  next_ind = indices[ii+1]
295
  next_ara = np.arange(next_ind,next_ind+S)
 
306
  feats8 = full_feats8[:,ara].reshape(B*(S),C2,H_pad//8,W_pad//8).detach()
307
  else:
308
  feats8 = None
309
+ # print("a4 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
310
 
311
  flow_predictions, visconf_predictions, flows8, visconfs8, feats8 = self.forward_window(
312
  fmap_anchor, fmaps2, visconfs8, iters=iters, flowfeat=feats8, flows8=flows8,
313
  is_training=is_training)
314
+ # print("a5 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
315
 
316
  unpad_flow_predictions = []
317
  unpad_visconf_predictions = []
 
320
  unpad_flow_predictions.append(flow_predictions[i].reshape(B,S,2,H,W))
321
  visconf_predictions[i] = padder.unpad(torch.sigmoid(visconf_predictions[i]))
322
  unpad_visconf_predictions.append(visconf_predictions[i].reshape(B,S,2,H,W))
323
+ # print("a6 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
324
 
325
  full_flows[:,ara] = unpad_flow_predictions[-1].reshape(B,S,2,H,W)
326
  full_flows8[:,ara] = flows8.reshape(B,S,2,H_pad//8,W_pad//8)
 
329
  if self.use_feats8:
330
  full_feats8[:,ara] = feats8.reshape(B,S,C2,H_pad//8,W_pad//8)
331
  visits[ara] += 1
332
+ # print("a7 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
333
 
334
  if is_training:
335
  all_flow_preds.append(unpad_flow_predictions)
 
348
  full_visconfs8[:,idx] = full_visconfs8[:,nearest]
349
  if self.use_feats8:
350
  full_feats8[:,idx] = full_feats8[:,nearest]
351
+ # print("a8 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
352
  else: # flow
353
 
354
  flows8 = torch.zeros((B,2,H_pad//8,W_pad//8), dtype=dtype, device=device)
 
370
  if (not is_training) and (T > 2):
371
  full_flows = full_flows[:,:T_bak]
372
  full_visconfs = full_visconfs[:,:T_bak]
373
+ # print("a9 torch.cuda.memory_allocated: %.1fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
374
 
375
  return full_flows, full_visconfs, all_flow_preds, all_visconf_preds
376
 
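For context on the windowed loop visible in this file (the commit itself only silences the debug prints): a minimal, hypothetical sketch of how overlapping windows of length S can tile T frames, with a `visits` counter confirming coverage. The exact construction of `indices` and the step size in the real model are assumptions here, loosely following the commented-out loop in app.py.

```python
import numpy as np

# Hypothetical tiling of T frames by overlapping windows of length S advancing by `step`
# (the real construction of `indices` is not shown in this diff).
T, S, step = 20, 8, 6

indices = list(range(0, T - S + 1, step))
if indices[-1] != T - S:
    indices.append(T - S)           # make sure the last window reaches the final frame

visits = np.zeros(T, dtype=int)
for ind in indices:
    ara = np.arange(ind, ind + S)   # frame indices covered by this window
    visits[ara] += 1                # overlapped frames are visited more than once

assert (visits > 0).all()           # every frame falls inside at least one window
print(visits)                       # [1 1 1 1 1 1 2 2 1 1 1 1 2 2 1 1 1 1 1 1]
```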
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ numpy==1.26.4
2
+ imageio==2.19.3
3
+ imageio-ffmpeg==0.4.7
4
+ tqdm
5
+ gradio
6
+ spaces
7
+ matplotlib
8
+ pillow
9
+ torch==2.2.0
10
+ torchvision==0.17.0
11
+ albumentations
12
+ pytorch-lightning==2.2.5
13
+ opencv-python
14
+ scikit-learn
15
+ scikit-image
16
+ einops
17
+ transformers