Nick White committed
Commit c689941 · 1 Parent(s): f161a5d

ADD initial config and app files

README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: Eagle
-emoji: 📊
-colorFrom: gray
-colorTo: pink
+title: YOLO-World + EfficientSAM
+emoji: 🔥
+colorFrom: purple
+colorTo: green
 sdk: gradio
-sdk_version: 4.26.0
+sdk_version: 4.19.0
 app_file: app.py
 pinned: false
 license: gpl-3.0
app.py ADDED
@@ -0,0 +1,317 @@
+from typing import List
+
+import os
+import cv2
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from tqdm import tqdm
+from inference.models import YOLOWorld
+
+from utils.efficient_sam import load, inference_with_boxes
+from utils.video import (
+    generate_file_name,
+    calculate_end_frame_index,
+    create_directory,
+    remove_files_older_than
+)
+
+MARKDOWN = """
+# YOLO-World + EfficientSAM Demo at SafetyCulture🔥
+"""
+
+RESULTS = "results"
+
+IMAGE_EXAMPLES = [
+    ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
+    ['https://media.roboflow.com/albert-4x.png', 'hand, hair', 0.005, 0.1, True, False, False],
+]
+VIDEO_EXAMPLES = [
+    ['https://media.roboflow.com/supervision/video-examples/croissant-1280x720.mp4', 'croissant', 0.01, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/suitcases-1280x720.mp4', 'suitcase', 0.1, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/tokyo-walk-1280x720.mp4', 'woman walking', 0.1, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/wooly-mammoth-1280x720.mp4', 'mammoth', 0.01, 0.2, False, False, False],
+]
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+EFFICIENT_SAM_MODEL = load(device=DEVICE)
+YOLO_WORLD_MODEL = YOLOWorld(model_id="yolo_world/l")
+
+BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
+MASK_ANNOTATOR = sv.MaskAnnotator()
+LABEL_ANNOTATOR = sv.LabelAnnotator()
+
+# creating video results directory
+create_directory(directory_path=RESULTS)
+
+
+def process_categories(categories: str) -> List[str]:
+    return [category.strip() for category in categories.split(',')]
+
+
+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections,
+    categories: List[str],
+    with_confidence: bool = False,
+) -> np.ndarray:
+    labels = [
+        (
+            f"{categories[class_id]}: {confidence:.3f}"
+            if with_confidence
+            else f"{categories[class_id]}"
+        )
+        for class_id, confidence in
+        zip(detections.class_id, detections.confidence)
+    ]
+    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+    return output_image
+
+
+def process_image(
+    input_image: np.ndarray,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+) -> np.ndarray:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    results = YOLO_WORLD_MODEL.infer(input_image, confidence=confidence_threshold)
+    detections = sv.Detections.from_inference(results)
+    detections = detections.with_nms(
+        class_agnostic=with_class_agnostic_nms,
+        threshold=iou_threshold
+    )
+    if with_segmentation:
+        detections.mask = inference_with_boxes(
+            image=input_image,
+            xyxy=detections.xyxy,
+            model=EFFICIENT_SAM_MODEL,
+            device=DEVICE
+        )
+    output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
+    output_image = annotate_image(
+        input_image=output_image,
+        detections=detections,
+        categories=categories,
+        with_confidence=with_confidence
+    )
+    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
+
+
+def process_video(
+    input_video: str,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            results = YOLO_WORLD_MODEL.infer(frame, confidence=confidence_threshold)
+            detections = sv.Detections.from_inference(results)
+            detections = detections.with_nms(
+                class_agnostic=with_class_agnostic_nms,
+                threshold=iou_threshold
+            )
+            if with_segmentation:
+                detections.mask = inference_with_boxes(
+                    image=frame,
+                    xyxy=detections.xyxy,
+                    model=EFFICIENT_SAM_MODEL,
+                    device=DEVICE
+                )
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections,
+                categories=categories,
+                with_confidence=with_confidence
+            )
+            sink.write_frame(frame)
+    return result_file_path
+
+
+confidence_threshold_component = gr.Slider(
+    minimum=0,
+    maximum=1.0,
+    value=0.3,
+    step=0.01,
+    label="Confidence Threshold",
+    info=(
+        "The confidence threshold for the YOLO-World model. Lower the threshold to "
+        "reduce false negatives, enhancing the model's sensitivity to detect "
+        "sought-after objects. Conversely, increase the threshold to minimize false "
+        "positives, preventing the model from identifying objects it shouldn't."
+    ))
+
+iou_threshold_component = gr.Slider(
+    minimum=0,
+    maximum=1.0,
+    value=0.5,
+    step=0.01,
+    label="IoU Threshold",
+    info=(
+        "The Intersection over Union (IoU) threshold for non-maximum suppression. "
+        "Decrease the value to lessen the occurrence of overlapping bounding boxes, "
+        "making the detection process stricter. On the other hand, increase the value "
+        "to allow more overlapping bounding boxes, accommodating a broader range of "
+        "detections."
+    ))
+
+with_segmentation_component = gr.Checkbox(
+    value=True,
+    label="With Segmentation",
+    info=(
+        "Whether to run EfficientSAM for instance segmentation."
+    )
+)
+
+with_confidence_component = gr.Checkbox(
+    value=False,
+    label="Display Confidence",
+    info=(
+        "Whether to display the confidence of the detected objects."
+    )
+)
+
+with_class_agnostic_nms_component = gr.Checkbox(
+    value=False,
+    label="Use Class-Agnostic NMS",
+    info=(
+        "Suppress overlapping bounding boxes across all classes."
+    )
+)
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(MARKDOWN)
+    with gr.Accordion("Configuration", open=False):
+        confidence_threshold_component.render()
+        iou_threshold_component.render()
+        with gr.Row():
+            with_segmentation_component.render()
+            with_confidence_component.render()
+            with_class_agnostic_nms_component.render()
+    with gr.Tab(label="Image"):
+        with gr.Row():
+            input_image_component = gr.Image(
+                type='numpy',
+                label='Input Image'
+            )
+            output_image_component = gr.Image(
+                type='numpy',
+                label='Output Image'
+            )
+        with gr.Row():
+            image_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            image_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_image,
+            examples=IMAGE_EXAMPLES,
+            inputs=[
+                input_image_component,
+                image_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
+        )
+    with gr.Tab(label="Video"):
+        with gr.Row():
+            input_video_component = gr.Video(
+                label='Input Video'
+            )
+            output_video_component = gr.Video(
+                label='Output Video'
+            )
+        with gr.Row():
+            video_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            video_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_video,
+            examples=VIDEO_EXAMPLES,
+            inputs=[
+                input_video_component,
+                video_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_video_component
+        )
+
+    image_submit_button_component.click(
+        fn=process_image,
+        inputs=[
+            input_image_component,
+            image_categories_text_component,
+            confidence_threshold_component,
+            iou_threshold_component,
+            with_segmentation_component,
+            with_confidence_component,
+            with_class_agnostic_nms_component
+        ],
+        outputs=output_image_component
+    )
+    video_submit_button_component.click(
+        fn=process_video,
+        inputs=[
+            input_video_component,
+            video_categories_text_component,
+            confidence_threshold_component,
+            iou_threshold_component,
+            with_segmentation_component,
+            with_confidence_component,
+            with_class_agnostic_nms_component
+        ],
+        outputs=output_video_component
+    )
+
+demo.launch(debug=False, show_error=True, max_threads=1)
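
For orientation, the detection → NMS → optional EfficientSAM segmentation chain inside `process_image` can be exercised without the Gradio UI. The sketch below is not part of the commit; it assumes the packages from requirements.txt are installed, the two `.jit` checkpoints sit in the working directory, and `dog.jpeg` is a hypothetical local copy of the first image example.

```python
import cv2
import supervision as sv
import torch
from inference.models import YOLOWorld

from utils.efficient_sam import load, inference_with_boxes

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = load(device=device)                  # loads the CPU or GPU .jit checkpoint
yolo_world = YOLOWorld(model_id="yolo_world/l")

image_rgb = cv2.cvtColor(cv2.imread("dog.jpeg"), cv2.COLOR_BGR2RGB)  # hypothetical local file

yolo_world.set_classes(["dog", "eye", "nose"])
results = yolo_world.infer(image_rgb, confidence=0.005)
detections = sv.Detections.from_inference(results).with_nms(threshold=0.1)
detections.mask = inference_with_boxes(
    image=image_rgb,
    xyxy=detections.xyxy,
    model=sam_model,
    device=device,
)
print(len(detections), "detections, mask array shape:", detections.mask.shape)
```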
efficient_sam_s_cpu.jit ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b63ab268e9020b0fb7fc9f46e742644d4c9ea6e5d9caf56045f0afb6475db09
+size 106006979
efficient_sam_s_gpu.jit ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e47c589ead2c6a80d38050ce63083a551e288db27113d534e0278270fc7cba26
+size 106006979
requirements.txt ADDED
@@ -0,0 +1,4 @@
+inference-gpu[yolo-world]==0.9.13
+supervision==0.19.0rc3
+gradio==4.19.0
+tqdm==4.66.2
utils/__init__.py ADDED
File without changes
utils/efficient_sam.py ADDED
@@ -0,0 +1,61 @@
+import torch
+import numpy as np
+from torchvision.transforms import ToTensor
+
+GPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_gpu.jit"
+CPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_cpu.jit"
+
+
+def load(device: torch.device) -> torch.jit.ScriptModule:
+    if device.type == "cuda":
+        model = torch.jit.load(GPU_EFFICIENT_SAM_CHECKPOINT)
+    else:
+        model = torch.jit.load(CPU_EFFICIENT_SAM_CHECKPOINT)
+    model.eval()
+    return model
+
+
+def inference_with_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
+    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
+    img_tensor = ToTensor()(image)
+
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        bbox.to(device),
+        bbox_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+            curr_predicted_iou > max_predicted_iou
+            or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou
+
+
+def inference_with_boxes(
+    image: np.ndarray,
+    xyxy: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    masks = []
+    for [x_min, y_min, x_max, y_max] in xyxy:
+        box = np.array([[x_min, y_min], [x_max, y_max]])
+        mask = inference_with_box(image, box, model, device)
+        masks.append(mask)
+    return np.array(masks)
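
As a usage note (not part of the commit): `inference_with_box` takes the prompt as a 2×2 array of `[[x_min, y_min], [x_max, y_max]]` pixel coordinates, and the `[2, 3]` labels appear to mark the two box corners in the model's point-prompt encoding. A minimal single-box sketch, assuming the CPU checkpoint is present and `dog.jpeg` is a hypothetical local image:

```python
import cv2
import numpy as np
import torch

from utils.efficient_sam import load, inference_with_box

device = torch.device("cpu")
model = load(device=device)  # falls back to efficient_sam_s_cpu.jit on CPU

image = cv2.cvtColor(cv2.imread("dog.jpeg"), cv2.COLOR_BGR2RGB)  # hypothetical local file
box = np.array([[100, 150], [400, 500]])  # [[x_min, y_min], [x_max, y_max]] in pixels

mask = inference_with_box(image, box, model, device)  # boolean mask with the image's H x W
print(mask.shape, mask.dtype, int(mask.sum()), "pixels selected")
```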
utils/video.py ADDED
@@ -0,0 +1,59 @@
+import os
+import datetime
+import uuid
+from typing import List
+
+import supervision as sv
+
+
+MAX_VIDEO_LENGTH_SEC = 2
+
+
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+
+
+def list_files_older_than(directory: str, diff_minutes: int) -> List[str]:
+    diff_seconds = diff_minutes * 60
+    now = datetime.datetime.now()
+    older_files: List[str] = []
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if os.path.isfile(file_path):
+            file_mod_time = os.path.getmtime(file_path)
+            file_mod_datetime = datetime.datetime.fromtimestamp(file_mod_time)
+            time_diff = now - file_mod_datetime
+            if time_diff.total_seconds() > diff_seconds:
+                older_files.append(file_path)
+
+    return older_files
+
+
+def remove_files_older_than(directory: str, diff_minutes: int) -> None:
+    older_files = list_files_older_than(directory, diff_minutes)
+    file_count = len(older_files)
+
+    for file_path in older_files:
+        os.remove(file_path)
+
+    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(
+        f"[{now}] Removed {file_count} files older than {diff_minutes} minutes from "
+        f"'{directory}' directory."
+    )
+
+
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
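
A brief sketch of how these helpers fit together, mirroring their use in app.py (the `input.mp4` path is hypothetical):

```python
from utils.video import (
    create_directory,
    generate_file_name,
    remove_files_older_than,
    calculate_end_frame_index,
)

create_directory("results")                    # no-op if the directory already exists
print(generate_file_name(extension="mp4"))     # e.g. "20240301120000_<uuid>.mp4"
remove_files_older_than("results", 30)         # delete results older than 30 minutes
print(calculate_end_frame_index("input.mp4"))  # capped at fps * MAX_VIDEO_LENGTH_SEC
```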