Nick White committed
Commit c689941 · 1 Parent(s): f161a5d

ADD initial config and app files

README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: Eagle
-emoji: 📊
-colorFrom: gray
-colorTo: pink
+title: YOLO-World + EfficientSAM
+emoji: 🔥
+colorFrom: purple
+colorTo: green
 sdk: gradio
-sdk_version: 4.26.0
+sdk_version: 4.19.0
 app_file: app.py
 pinned: false
 license: gpl-3.0
app.py ADDED
@@ -0,0 +1,317 @@
+from typing import List
+
+import os
+import cv2
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from tqdm import tqdm
+from inference.models import YOLOWorld
+
+from utils.efficient_sam import load, inference_with_boxes
+from utils.video import (
+    generate_file_name,
+    calculate_end_frame_index,
+    create_directory,
+    remove_files_older_than
+)
+
+MARKDOWN = """
+# YOLO-World + EfficientSAM Demo at SafetyCulture🔥
+"""
+
+RESULTS = "results"
+
+IMAGE_EXAMPLES = [
+    ['https://media.roboflow.com/dog.jpeg', 'dog, eye, nose, tongue, car', 0.005, 0.1, True, False, False],
+    ['https://media.roboflow.com/albert-4x.png', 'hand, hair', 0.005, 0.1, True, False, False],
+]
+VIDEO_EXAMPLES = [
+    ['https://media.roboflow.com/supervision/video-examples/croissant-1280x720.mp4', 'croissant', 0.01, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/suitcases-1280x720.mp4', 'suitcase', 0.1, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/tokyo-walk-1280x720.mp4', 'woman walking', 0.1, 0.2, False, False, False],
+    ['https://media.roboflow.com/supervision/video-examples/wooly-mammoth-1280x720.mp4', 'mammoth', 0.01, 0.2, False, False, False],
+]
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+EFFICIENT_SAM_MODEL = load(device=DEVICE)
+YOLO_WORLD_MODEL = YOLOWorld(model_id="yolo_world/l")
+
+BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
+MASK_ANNOTATOR = sv.MaskAnnotator()
+LABEL_ANNOTATOR = sv.LabelAnnotator()
+
+# creating video results directory
+create_directory(directory_path=RESULTS)
+
+
+def process_categories(categories: str) -> List[str]:
+    return [category.strip() for category in categories.split(',')]
+
+
+def annotate_image(
+    input_image: np.ndarray,
+    detections: sv.Detections,
+    categories: List[str],
+    with_confidence: bool = False,
+) -> np.ndarray:
+    labels = [
+        (
+            f"{categories[class_id]}: {confidence:.3f}"
+            if with_confidence
+            else f"{categories[class_id]}"
+        )
+        for class_id, confidence in
+        zip(detections.class_id, detections.confidence)
+    ]
+    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
+    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
+    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
+    return output_image
+
+
+def process_image(
+    input_image: np.ndarray,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+) -> np.ndarray:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    results = YOLO_WORLD_MODEL.infer(input_image, confidence=confidence_threshold)
+    detections = sv.Detections.from_inference(results)
+    detections = detections.with_nms(
+        class_agnostic=with_class_agnostic_nms,
+        threshold=iou_threshold
+    )
+    if with_segmentation:
+        detections.mask = inference_with_boxes(
+            image=input_image,
+            xyxy=detections.xyxy,
+            model=EFFICIENT_SAM_MODEL,
+            device=DEVICE
+        )
+    output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
+    output_image = annotate_image(
+        input_image=output_image,
+        detections=detections,
+        categories=categories,
+        with_confidence=with_confidence
+    )
+    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)
+
+
+def process_video(
+    input_video: str,
+    categories: str,
+    confidence_threshold: float = 0.3,
+    iou_threshold: float = 0.5,
+    with_segmentation: bool = True,
+    with_confidence: bool = False,
+    with_class_agnostic_nms: bool = False,
+    progress=gr.Progress(track_tqdm=True)
+) -> str:
+    # cleanup of old video files
+    remove_files_older_than(RESULTS, 30)
+
+    categories = process_categories(categories)
+    YOLO_WORLD_MODEL.set_classes(categories)
+    video_info = sv.VideoInfo.from_video_path(input_video)
+    total = calculate_end_frame_index(input_video)
+    frame_generator = sv.get_video_frames_generator(
+        source_path=input_video,
+        end=total
+    )
+    result_file_name = generate_file_name(extension="mp4")
+    result_file_path = os.path.join(RESULTS, result_file_name)
+    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
+        for _ in tqdm(range(total), desc="Processing video..."):
+            frame = next(frame_generator)
+            results = YOLO_WORLD_MODEL.infer(frame, confidence=confidence_threshold)
+            detections = sv.Detections.from_inference(results)
+            detections = detections.with_nms(
+                class_agnostic=with_class_agnostic_nms,
+                threshold=iou_threshold
+            )
+            if with_segmentation:
+                detections.mask = inference_with_boxes(
+                    image=frame,
+                    xyxy=detections.xyxy,
+                    model=EFFICIENT_SAM_MODEL,
+                    device=DEVICE
+                )
+            frame = annotate_image(
+                input_image=frame,
+                detections=detections,
+                categories=categories,
+                with_confidence=with_confidence
+            )
+            sink.write_frame(frame)
+    return result_file_path
+
+
+confidence_threshold_component = gr.Slider(
+    minimum=0,
+    maximum=1.0,
+    value=0.3,
+    step=0.01,
+    label="Confidence Threshold",
+    info=(
+        "The confidence threshold for the YOLO-World model. Lower the threshold to "
+        "reduce false negatives, enhancing the model's sensitivity to detect "
+        "sought-after objects. Conversely, increase the threshold to minimize false "
+        "positives, preventing the model from identifying objects it shouldn't."
+    ))
+
+iou_threshold_component = gr.Slider(
+    minimum=0,
+    maximum=1.0,
+    value=0.5,
+    step=0.01,
+    label="IoU Threshold",
+    info=(
+        "The Intersection over Union (IoU) threshold for non-maximum suppression. "
+        "Decrease the value to lessen the occurrence of overlapping bounding boxes, "
+        "making the detection process stricter. On the other hand, increase the value "
+        "to allow more overlapping bounding boxes, accommodating a broader range of "
+        "detections."
+    ))
+
+with_segmentation_component = gr.Checkbox(
+    value=True,
+    label="With Segmentation",
+    info=(
+        "Whether to run EfficientSAM for instance segmentation."
+    )
+)
+
+with_confidence_component = gr.Checkbox(
+    value=False,
+    label="Display Confidence",
+    info=(
+        "Whether to display the confidence of the detected objects."
+    )
+)
+
+with_class_agnostic_nms_component = gr.Checkbox(
+    value=False,
+    label="Use Class-Agnostic NMS",
+    info=(
+        "Suppress overlapping bounding boxes across all classes."
+    )
+)
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(MARKDOWN)
+    with gr.Accordion("Configuration", open=False):
+        confidence_threshold_component.render()
+        iou_threshold_component.render()
+        with gr.Row():
+            with_segmentation_component.render()
+            with_confidence_component.render()
+            with_class_agnostic_nms_component.render()
+    with gr.Tab(label="Image"):
+        with gr.Row():
+            input_image_component = gr.Image(
+                type='numpy',
+                label='Input Image'
+            )
+            output_image_component = gr.Image(
+                type='numpy',
+                label='Output Image'
+            )
+        with gr.Row():
+            image_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            image_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_image,
+            examples=IMAGE_EXAMPLES,
+            inputs=[
+                input_image_component,
+                image_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_image_component
+        )
+    with gr.Tab(label="Video"):
+        with gr.Row():
+            input_video_component = gr.Video(
+                label='Input Video'
+            )
+            output_video_component = gr.Video(
+                label='Output Video'
+            )
+        with gr.Row():
+            video_categories_text_component = gr.Textbox(
+                label='Categories',
+                placeholder='comma separated list of categories',
+                scale=7
+            )
+            video_submit_button_component = gr.Button(
+                value='Submit',
+                scale=1,
+                variant='primary'
+            )
+        gr.Examples(
+            fn=process_video,
+            examples=VIDEO_EXAMPLES,
+            inputs=[
+                input_video_component,
+                video_categories_text_component,
+                confidence_threshold_component,
+                iou_threshold_component,
+                with_segmentation_component,
+                with_confidence_component,
+                with_class_agnostic_nms_component
+            ],
+            outputs=output_video_component
+        )
+
+    image_submit_button_component.click(
+        fn=process_image,
+        inputs=[
+            input_image_component,
+            image_categories_text_component,
+            confidence_threshold_component,
+            iou_threshold_component,
+            with_segmentation_component,
+            with_confidence_component,
+            with_class_agnostic_nms_component
+        ],
+        outputs=output_image_component
+    )
+    video_submit_button_component.click(
+        fn=process_video,
+        inputs=[
+            input_video_component,
+            video_categories_text_component,
+            confidence_threshold_component,
+            iou_threshold_component,
+            with_segmentation_component,
+            with_confidence_component,
+            with_class_agnostic_nms_component
+        ],
+        outputs=output_video_component
+    )
+
+demo.launch(debug=False, show_error=True, max_threads=1)
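
For orientation, the detection → NMS → optional EfficientSAM segmentation chain inside `process_image` can be exercised without the Gradio UI. The sketch below is not part of the commit; it assumes the packages from requirements.txt are installed, the two `.jit` checkpoints sit in the working directory, and `dog.jpeg` is a hypothetical local copy of the first image example.

```python
import cv2
import supervision as sv
import torch
from inference.models import YOLOWorld

from utils.efficient_sam import load, inference_with_boxes

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = load(device=device)                  # loads the CPU or GPU .jit checkpoint
yolo_world = YOLOWorld(model_id="yolo_world/l")

image_rgb = cv2.cvtColor(cv2.imread("dog.jpeg"), cv2.COLOR_BGR2RGB)  # hypothetical local file

yolo_world.set_classes(["dog", "eye", "nose"])
results = yolo_world.infer(image_rgb, confidence=0.005)
detections = sv.Detections.from_inference(results).with_nms(threshold=0.1)
detections.mask = inference_with_boxes(
    image=image_rgb,
    xyxy=detections.xyxy,
    model=sam_model,
    device=device,
)
print(len(detections), "detections, mask array shape:", detections.mask.shape)
```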
efficient_sam_s_cpu.jit ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b63ab268e9020b0fb7fc9f46e742644d4c9ea6e5d9caf56045f0afb6475db09
+size 106006979
efficient_sam_s_gpu.jit ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e47c589ead2c6a80d38050ce63083a551e288db27113d534e0278270fc7cba26
+size 106006979
requirements.txt ADDED
@@ -0,0 +1,4 @@
+inference-gpu[yolo-world]==0.9.13
+supervision==0.19.0rc3
+gradio==4.19.0
+tqdm==4.66.2
utils/__init__.py ADDED
File without changes
utils/efficient_sam.py ADDED
@@ -0,0 +1,61 @@
+import torch
+import numpy as np
+from torchvision.transforms import ToTensor
+
+GPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_gpu.jit"
+CPU_EFFICIENT_SAM_CHECKPOINT = "efficient_sam_s_cpu.jit"
+
+
+def load(device: torch.device) -> torch.jit.ScriptModule:
+    if device.type == "cuda":
+        model = torch.jit.load(GPU_EFFICIENT_SAM_CHECKPOINT)
+    else:
+        model = torch.jit.load(CPU_EFFICIENT_SAM_CHECKPOINT)
+    model.eval()
+    return model
+
+
+def inference_with_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    bbox = torch.reshape(torch.tensor(box), [1, 1, 2, 2])
+    bbox_labels = torch.reshape(torch.tensor([2, 3]), [1, 1, 2])
+    img_tensor = ToTensor()(image)
+
+    predicted_logits, predicted_iou = model(
+        img_tensor[None, ...].to(device),
+        bbox.to(device),
+        bbox_labels.to(device),
+    )
+    predicted_logits = predicted_logits.cpu()
+    all_masks = torch.ge(torch.sigmoid(predicted_logits[0, 0, :, :, :]), 0.5).numpy()
+    predicted_iou = predicted_iou[0, 0, ...].cpu().detach().numpy()
+
+    max_predicted_iou = -1
+    selected_mask_using_predicted_iou = None
+    for m in range(all_masks.shape[0]):
+        curr_predicted_iou = predicted_iou[m]
+        if (
+            curr_predicted_iou > max_predicted_iou
+            or selected_mask_using_predicted_iou is None
+        ):
+            max_predicted_iou = curr_predicted_iou
+            selected_mask_using_predicted_iou = all_masks[m]
+    return selected_mask_using_predicted_iou
+
+
+def inference_with_boxes(
+    image: np.ndarray,
+    xyxy: np.ndarray,
+    model: torch.jit.ScriptModule,
+    device: torch.device
+) -> np.ndarray:
+    masks = []
+    for [x_min, y_min, x_max, y_max] in xyxy:
+        box = np.array([[x_min, y_min], [x_max, y_max]])
+        mask = inference_with_box(image, box, model, device)
+        masks.append(mask)
+    return np.array(masks)
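
As a usage note (not part of the commit): `inference_with_box` takes the prompt as a 2×2 array of `[[x_min, y_min], [x_max, y_max]]` pixel coordinates, and the `[2, 3]` labels appear to mark the two box corners in the model's point-prompt encoding. A minimal single-box sketch, assuming the CPU checkpoint is present and `dog.jpeg` is a hypothetical local image:

```python
import cv2
import numpy as np
import torch

from utils.efficient_sam import load, inference_with_box

device = torch.device("cpu")
model = load(device=device)  # falls back to efficient_sam_s_cpu.jit on CPU

image = cv2.cvtColor(cv2.imread("dog.jpeg"), cv2.COLOR_BGR2RGB)  # hypothetical local file
box = np.array([[100, 150], [400, 500]])  # [[x_min, y_min], [x_max, y_max]] in pixels

mask = inference_with_box(image, box, model, device)  # boolean mask with the image's H x W
print(mask.shape, mask.dtype, int(mask.sum()), "pixels selected")
```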
utils/video.py ADDED
@@ -0,0 +1,59 @@
+import os
+import datetime
+import uuid
+from typing import List
+
+import supervision as sv
+
+
+MAX_VIDEO_LENGTH_SEC = 2
+
+
+def generate_file_name(extension="mp4"):
+    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+    unique_id = uuid.uuid4()
+    return f"{current_datetime}_{unique_id}.{extension}"
+
+
+def list_files_older_than(directory: str, diff_minutes: int) -> List[str]:
+    diff_seconds = diff_minutes * 60
+    now = datetime.datetime.now()
+    older_files: List[str] = []
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if os.path.isfile(file_path):
+            file_mod_time = os.path.getmtime(file_path)
+            file_mod_datetime = datetime.datetime.fromtimestamp(file_mod_time)
+            time_diff = now - file_mod_datetime
+            if time_diff.total_seconds() > diff_seconds:
+                older_files.append(file_path)
+
+    return older_files
+
+
+def remove_files_older_than(directory: str, diff_minutes: int) -> None:
+    older_files = list_files_older_than(directory, diff_minutes)
+    file_count = len(older_files)
+
+    for file_path in older_files:
+        os.remove(file_path)
+
+    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(
+        f"[{now}] Removed {file_count} files older than {diff_minutes} minutes from "
+        f"'{directory}' directory."
+    )
+
+
+def calculate_end_frame_index(source_video_path: str) -> int:
+    video_info = sv.VideoInfo.from_video_path(source_video_path)
+    return min(
+        video_info.total_frames,
+        video_info.fps * MAX_VIDEO_LENGTH_SEC
+    )
+
+
+def create_directory(directory_path: str) -> None:
+    if not os.path.exists(directory_path):
+        os.makedirs(directory_path)
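
A brief sketch of how these helpers fit together, mirroring their use in app.py (the `input.mp4` path is hypothetical):

```python
from utils.video import (
    create_directory,
    generate_file_name,
    remove_files_older_than,
    calculate_end_frame_index,
)

create_directory("results")                    # no-op if the directory already exists
print(generate_file_name(extension="mp4"))     # e.g. "20240301120000_<uuid>.mp4"
remove_files_older_than("results", 30)         # delete results older than 30 minutes
print(calculate_end_frame_index("input.mp4"))  # capped at fps * MAX_VIDEO_LENGTH_SEC
```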