Spaces:

ustc-community
/

d-fine-object-detection

Running on Zero

App Files Files Community

qubvel-hf HF Staff committed on May 12

Commit

7beae6a

1 Parent(s): 745226b

Add tracking

Browse files

Files changed (2) hide show

app.py +114 -16
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import logging
 import torch
 import spaces
 import numpy as np
 import gradio as gr
 import imageio.v3 as iio
@@ -61,13 +62,29 @@ BATCH_SIZE = 4
 ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
 VIDEO_OUTPUT_DIR = Path("static/videos")
 VIDEO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
 VIDEO_EXAMPLES = [
-    {"path": "./examples/videos/dogs_running.mp4", "label": "Local Video"},
-    {"path": "./examples/videos/traffic.mp4", "label": "Local Video"},
-    {"path": "./examples/videos/fast_and_furious.mp4", "label": "Local Video"},
-    {"path": "./examples/videos/break_dance.mp4", "label": "Local Video"},
 ]
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
@@ -88,12 +105,21 @@ def detect_objects(
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     target_size: Optional[Tuple[int, int]] = None,
     batch_size: int = BATCH_SIZE,
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model, image_processor = get_model_and_processor(checkpoint)
     model = model.to(device)
     if isinstance(images, np.ndarray) and images.ndim == 4:
         images = [x for x in images]  # split video array into list of images
@@ -125,6 +151,9 @@ def detect_objects(
     # move results to cpu
     for i, result in enumerate(results):
         results[i] = {k: v.cpu() for k, v in result.items()}
     return results, model.config.id2label
@@ -201,9 +230,34 @@ def read_video_k_frames(video_path: str, k: int, read_every_i_frame: int = 1):
     return frames
 def process_video(
     video_path: str,
     checkpoint: str,
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> str:
@@ -224,23 +278,51 @@ def process_video(
     frames = read_video_k_frames(video_path, n_frames_to_read, read_each_i_frame)
     frames = [cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_CUBIC) for frame in frames]
-    box_annotator = sv.BoxAnnotator(thickness=1)
-    label_annotator = sv.LabelAnnotator(text_scale=0.5)
     results, id2label = detect_objects(
         images=np.array(frames),
         checkpoint=checkpoint,
         confidence_threshold=confidence_threshold,
         target_size=(target_height, target_width),
     )
     annotated_frames = []
-    for frame, result in tqdm.tqdm(zip(frames, results), desc="Annotating frames", total=len(frames)):
-        detections = sv.Detections.from_transformers(result, id2label=id2label)
-        detections = detections.with_nms(threshold=0.95, class_agnostic=True)
-        annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
-        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections)
-        annotated_frames.append(annotated_frame)
     output_filename = os.path.join(VIDEO_OUTPUT_DIR, f"output_{uuid.uuid4()}.mp4")
     iio.imwrite(output_filename, annotated_frames, fps=target_fps, codec="h264")
@@ -296,6 +378,18 @@ def create_video_inputs() -> List[gr.components.Component]:
             value=DEFAULT_CHECKPOINT,
             elem_classes="input-component",
         ),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
@@ -380,7 +474,7 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
             with gr.Row():
                 with gr.Column(scale=1, min_width=300):
                     with gr.Group():
-                        video_input, video_checkpoint, video_confidence_threshold = create_video_inputs()
                         video_detect_button, video_clear_button = create_button_row()
                 with gr.Column(scale=2):
                     video_output = gr.Video(
@@ -391,10 +485,10 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
             gr.Examples(
                 examples=[
-                    [example["path"], DEFAULT_CHECKPOINT, DEFAULT_CONFIDENCE_THRESHOLD]
                     for example in VIDEO_EXAMPLES
                 ],
-                inputs=[video_input, video_checkpoint, video_confidence_threshold],
                 outputs=[video_output],
                 fn=process_video,
                 cache_examples=False,
@@ -433,12 +527,16 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         fn=lambda: (
             None,
             DEFAULT_CHECKPOINT,
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
         ),
         outputs=[
             video_input,
             video_checkpoint,
             video_confidence_threshold,
             video_output,
         ],
@@ -460,7 +558,7 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     # Video detect button
     video_detect_button.click(
         fn=process_video,
-        inputs=[video_input, video_checkpoint, video_confidence_threshold],
         outputs=[video_output],
     )

 import torch
 import spaces
+import trackers
 import numpy as np
 import gradio as gr
 import imageio.v3 as iio
 ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
 VIDEO_OUTPUT_DIR = Path("static/videos")
 VIDEO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+class TrackingAlgorithm:  # Display names of the supported tracking algorithms (also listed in TRACKERS).
+    BYTETRACK = "ByteTrack (2021)"  # get_tracker maps this name to sv.ByteTrack
+    DEEPSORT = "DeepSORT (2017)"  # get_tracker maps this name to trackers.DeepSORTTracker
+    SORT = "SORT (2016)"  # get_tracker maps this name to trackers.SORTTracker
+TRACKERS = [None, TrackingAlgorithm.BYTETRACK, TrackingAlgorithm.DEEPSORT, TrackingAlgorithm.SORT]
 VIDEO_EXAMPLES = [
+    {"path": "./examples/videos/dogs_running.mp4", "label": "Local Video", "tracker": None, "classes": "all"},
+    {"path": "./examples/videos/traffic.mp4", "label": "Local Video", "tracker": TrackingAlgorithm.BYTETRACK, "classes": "car, truck, bus"},
+    {"path": "./examples/videos/fast_and_furious.mp4", "label": "Local Video", "tracker": None, "classes": "all"},
+    {"path": "./examples/videos/break_dance.mp4", "label": "Local Video", "tracker": None, "classes": "all"},
 ]
+# Create a color palette for visualization
+# These hex color codes define different colors for tracking different objects
+color = sv.ColorPalette.from_hex([
+    "#ffff00", "#ff9b00", "#ff8080", "#ff66b2", "#ff66ff", "#b266ff",
+    "#9999ff", "#3399ff", "#66ffff", "#33ff99", "#66ff66", "#99ff00"
+])
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     target_size: Optional[Tuple[int, int]] = None,
     batch_size: int = BATCH_SIZE,
+    classes: Optional[List[str]] = None,
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model, image_processor = get_model_and_processor(checkpoint)
     model = model.to(device)
+    if classes is not None:
+        wrong_classes = [cls for cls in classes if cls not in model.config.label2id]
+        if wrong_classes:
+            gr.Warning(f"Classes not found in model config: {wrong_classes}")
+        keep_ids = [model.config.label2id[cls] for cls in classes if cls in model.config.label2id]
+    else:
+        keep_ids = None
     if isinstance(images, np.ndarray) and images.ndim == 4:
         images = [x for x in images]  # split video array into list of images
     # move results to cpu
     for i, result in enumerate(results):
         results[i] = {k: v.cpu() for k, v in result.items()}
+        if keep_ids is not None:
+            keep = torch.isin(results[i]["labels"], torch.tensor(keep_ids))
+            results[i] = {k: v[keep] for k, v in results[i].items()}
     return results, model.config.id2label
     return frames
+def get_tracker(tracker: str, fps: float):
+    if tracker == TrackingAlgorithm.SORT:
+        return trackers.SORTTracker(frame_rate=fps)
+    elif tracker == TrackingAlgorithm.DEEPSORT:
+        feature_extractor = trackers.DeepSORTFeatureExtractor.from_timm("mobilenetv4_conv_small.e1200_r224_in1k", device="cpu")
+        return trackers.DeepSORTTracker(feature_extractor, frame_rate=fps)
+    elif tracker == TrackingAlgorithm.BYTETRACK:
+        return sv.ByteTrack(frame_rate=int(fps))
+    else:
+        raise ValueError(f"Invalid tracker: {tracker}")
+def update_tracker(tracker, detections, frame):
+    if isinstance(tracker, trackers.SORTTracker):
+        return tracker.update(detections)
+    elif isinstance(tracker, trackers.DeepSORTTracker):
+        return tracker.update(detections, frame)
+    elif isinstance(tracker, sv.ByteTrack):
+        return tracker.update_with_detections(detections)
+    else:
+        raise ValueError(f"Invalid tracker: {tracker}")
 def process_video(
     video_path: str,
     checkpoint: str,
+    tracker_algorithm: Optional[str] = None,
+    classes: str = "all",
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> str:
     frames = read_video_k_frames(video_path, n_frames_to_read, read_each_i_frame)
     frames = [cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_CUBIC) for frame in frames]
+    # Choose the color lookup mode: by track ID when a tracker is selected, otherwise by class.
+    # With track-ID lookup, objects with the same track ID are annotated with the same color.
+    color_lookup = sv.ColorLookup.TRACK if tracker_algorithm else sv.ColorLookup.CLASS
+    box_annotator = sv.BoxAnnotator(color, color_lookup=color_lookup, thickness=1)
+    label_annotator = sv.LabelAnnotator(color, color_lookup=color_lookup, text_scale=0.5)
+    trace_annotator = sv.TraceAnnotator(color, color_lookup=color_lookup, thickness=1, trace_length=100)
+    # preprocess classes
+    if classes != "all":
+        classes_list = [cls.strip().lower() for cls in classes.split(",")]
+    else:
+        classes_list = None
     results, id2label = detect_objects(
         images=np.array(frames),
         checkpoint=checkpoint,
         confidence_threshold=confidence_threshold,
         target_size=(target_height, target_width),
+        classes=classes_list,
     )
     annotated_frames = []
+    # detections
+    if tracker_algorithm:
+        tracker = get_tracker(tracker_algorithm, target_fps)
+        for frame, result in progress.tqdm(zip(frames, results), desc="Tracking objects", total=len(frames)):
+            detections = sv.Detections.from_transformers(result, id2label=id2label)
+            detections = detections.with_nms(threshold=0.95, class_agnostic=True)
+            detections = update_tracker(tracker, detections, frame)
+            labels = [f"#{tracker_id} {id2label[class_id]}" for class_id, tracker_id in zip(detections.class_id, detections.tracker_id)]
+            annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
+            annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
+            annotated_frame = trace_annotator.annotate(scene=annotated_frame, detections=detections)
+            annotated_frames.append(annotated_frame)
+    else:
+        for frame, result in tqdm.tqdm(zip(frames, results), desc="Annotating frames", total=len(frames)):
+            detections = sv.Detections.from_transformers(result, id2label=id2label)
+            detections = detections.with_nms(threshold=0.95, class_agnostic=True)
+            annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
+            annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections)
+            annotated_frames.append(annotated_frame)
     output_filename = os.path.join(VIDEO_OUTPUT_DIR, f"output_{uuid.uuid4()}.mp4")
     iio.imwrite(output_filename, annotated_frames, fps=target_fps, codec="h264")
             value=DEFAULT_CHECKPOINT,
             elem_classes="input-component",
         ),
+        gr.Dropdown(
+            choices=TRACKERS,
+            label="Select Tracker (Optional)",
+            value=None,
+            elem_classes="input-component",
+        ),
+        gr.TextArea(
+            label="Specify Class Names to Detect (comma separated)",
+            value="all",
+            lines=1,
+            elem_classes="input-component",
+        ),
         gr.Slider(
             minimum=0.1,
             maximum=1.0,
             with gr.Row():
                 with gr.Column(scale=1, min_width=300):
                     with gr.Group():
+                        video_input, video_checkpoint, video_tracker, video_classes, video_confidence_threshold = create_video_inputs()
                         video_detect_button, video_clear_button = create_button_row()
                 with gr.Column(scale=2):
                     video_output = gr.Video(
             gr.Examples(
                 examples=[
+                    [example["path"], DEFAULT_CHECKPOINT, example["tracker"], example["classes"], DEFAULT_CONFIDENCE_THRESHOLD]
                     for example in VIDEO_EXAMPLES
                 ],
+                inputs=[video_input, video_checkpoint, video_tracker, video_classes, video_confidence_threshold],
                 outputs=[video_output],
                 fn=process_video,
                 cache_examples=False,
         fn=lambda: (
             None,
             DEFAULT_CHECKPOINT,
+            None,
+            "all",
             DEFAULT_CONFIDENCE_THRESHOLD,
             None,
         ),
         outputs=[
             video_input,
             video_checkpoint,
+            video_tracker,
+            video_classes,
             video_confidence_threshold,
             video_output,
         ],
     # Video detect button
     video_detect_button.click(
         fn=process_video,
+        inputs=[video_input, video_checkpoint, video_tracker, video_classes, video_confidence_threshold],
         outputs=[video_output],
     )

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ tqdm
 pillow
 supervision
 spaces
-imageio[pyav]

 pillow
 supervision
 spaces
+imageio[pyav]
+trackers @ git+https://github.com/roboflow/trackers