assentian1970 committed
Commit 15fed3e · verified · 1 Parent(s): 1610dc2

Upload 3 files

Files changed (3)
  1. app.py +573 -0
  2. best_yolov11.pt +3 -0
  3. requirements.txt +26 -0
app.py ADDED
@@ -0,0 +1,573 @@
+ import spaces
+ import torch
+ @spaces.GPU
+ def debug():
+     torch.randn(10).cuda()
+ debug()
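+ # Note: the decorated no-op above runs a tiny CUDA op at import time,
+ # presumably to confirm the GPU is reachable (e.g. on a ZeroGPU Space)
+ # before the heavy models below are loaded.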
+ import argparse
+ from transformers import AutoModel, AutoTokenizer
+ from modelscope.hub.snapshot_download import snapshot_download
+ from PIL import Image
+ from decord import VideoReader, cpu
+ import io
+ import os
+ os.system("nvidia-smi")
+ import copy
+ import requests
+ import base64
+ import json
+ import traceback
+ import re
+ import gc
+ import random
+ import tempfile
+ from ultralytics import YOLO
+ import numpy as np
+ import cv2
+ import gradio as gr
+ from datetime import datetime
+
+ # Load the custom-trained YOLOv11 detection model
+ YOLO_MODEL = YOLO('./best_yolov11.pt')
+
+ # Check if CUDA is available
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Initialize GPU if available
+ if DEVICE == "cuda":
+     def debug():
+         torch.randn(10).cuda()
+     debug()  # repeats the import-time warm-up above
+
+ # File type validation
+ IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
+ VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
+
+ def get_file_extension(filename):
+     return os.path.splitext(filename)[1].lower()
+
+ def is_image(filename):
+     return get_file_extension(filename) in IMAGE_EXTENSIONS
+
+ def is_video(filename):
+     return get_file_extension(filename) in VIDEO_EXTENSIONS
+
+ # Argparser
+ parser = argparse.ArgumentParser(description='demo')
+ parser.add_argument('--device', type=str, default='cuda', help='cuda or mps')
+ parser.add_argument("--host", type=str, default="0.0.0.0")
+ parser.add_argument("--port", type=int)
+ args = parser.parse_args()
+ device = args.device
+ assert device in ['cuda', 'mps']
+
+ # Model configuration
+ MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
+ MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
+
+ # Create cache directory if it doesn't exist
+ os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+
+ # Download and cache the model
+ try:
+     model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
+ except Exception as e:
+     print(f"Error downloading model: {str(e)}")
+     model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
+
+ MAX_NUM_FRAMES = 64
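+ # Videos are sampled at roughly one frame per second (see
+ # encode_video_in_chunks below) and fed to the model in chunks of at most
+ # MAX_NUM_FRAMES frames, so arbitrarily long videos are analyzed piecewise.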
+
+ def load_model_and_tokenizer():
+     """Load a fresh instance of the model and tokenizer"""
+     try:
+         # Clear GPU memory if using CUDA
+         if DEVICE == "cuda":
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         model = AutoModel.from_pretrained(
+             model_path,
+             attn_implementation='flash_attention_2',
+             trust_remote_code=True,
+             torch_dtype=torch.half,
+             device_map='auto'
+         )
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_path,
+             trust_remote_code=True
+         )
+         model.eval()
+         processor = model.init_processor(tokenizer)
+         return model, tokenizer, processor
+     except Exception as e:
+         print(f"Error loading model: {str(e)}")
+         raise
+
+ def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
+     """Process a chunk of video frames with mPLUG model"""
+     messages = [
+         {
+             "role": "user",
+             "content": prompt,
+             "video_frames": video_frames
+         }
+     ]
+
+     model_messages = []
+     videos = []
+
+     for msg in messages:
+         content_str = msg["content"]
+         if "video_frames" in msg and msg["video_frames"]:
+             content_str += "<|video|>"
+             videos.append(msg["video_frames"])
+         model_messages.append({
+             "role": msg["role"],
+             "content": content_str
+         })
+
+     model_messages.append({
+         "role": "assistant",
+         "content": ""
+     })
+
+     inputs = processor(
+         model_messages,
+         images=None,
+         videos=videos if videos else None
+     )
+     inputs.to('cuda')
+     inputs.update({
+         'tokenizer': tokenizer,
+         'max_new_tokens': 100,
+         'decode_text': True,
+     })
+
+     response = model.generate(**inputs)
+     return response[0]
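+ # mPLUG-Owl3's processor expects a <|video|> (or, for stills, <|image|>)
+ # placeholder in the chat text, with the actual frames passed separately.
+ # Note that inputs are moved to 'cuda' unconditionally here, so the
+ # --device mps path would need adjusting before this runs off-CUDA.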
+
+ def encode_video_in_chunks(video_path):
+     """Extract frames from a video in chunks"""
+     vr = VideoReader(video_path, ctx=cpu(0))
+     sample_fps = max(1, round(vr.get_avg_fps()))  # frame stride: ~1 sampled frame per second
+     frame_idx = [i for i in range(0, len(vr), sample_fps)]
+
+     # Split frame indices into chunks
+     chunks = [
+         frame_idx[i:i + MAX_NUM_FRAMES]
+         for i in range(0, len(frame_idx), MAX_NUM_FRAMES)
+     ]
+
+     for chunk_idx, chunk in enumerate(chunks):
+         frames = vr.get_batch(chunk).asnumpy()
+         frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+         yield chunk_idx, frames
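+ # Yielding one chunk at a time keeps at most MAX_NUM_FRAMES decoded frames
+ # in memory rather than materializing every sampled frame of the video.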
+
+ def detect_people_and_machinery(media_path):
+     """Detect people and machinery using YOLOv11 for both images and videos"""
+     try:
+         # Track the maximum counts seen across frames
+         max_people_count = 0
+         max_machine_types = {
+             "Tower Crane": 0,
+             "Mobile Crane": 0,
+             "Compactor/Roller": 0,
+             "Bulldozer": 0,
+             "Excavator": 0,
+             "Dump Truck": 0,
+             "Concrete Mixer": 0,
+             "Loader": 0,
+             "Pump Truck": 0,
+             "Pile Driver": 0,
+             "Grader": 0,
+             "Other Vehicle": 0
+         }
+
+         # Check if input is video
+         if isinstance(media_path, str) and is_video(media_path):
+             cap = cv2.VideoCapture(media_path)
+             fps = cap.get(cv2.CAP_PROP_FPS)
+             sample_rate = max(1, int(fps))  # Sample 1 frame per second
+             frame_count = 0  # Initialize frame counter
+
+             while cap.isOpened():
+                 ret, frame = cap.read()
+                 if not ret:
+                     break
+
+                 # Process every nth frame based on sample rate
+                 if frame_count % sample_rate == 0:
+                     results = YOLO_MODEL(frame)
+                     people, _, machine_types = process_yolo_results(results)
+
+                     # Update maximum counts
+                     max_people_count = max(max_people_count, people)
+                     for k, v in machine_types.items():
+                         max_machine_types[k] = max(max_machine_types[k], v)
+
+                 frame_count += 1
+
+             cap.release()
+
+         else:
+             # Handle single image
+             if isinstance(media_path, str):
+                 img = cv2.imread(media_path)
+             else:
+                 # Handle PIL Image
+                 img = cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
+
+             results = YOLO_MODEL(img)
+             max_people_count, _, max_machine_types = process_yolo_results(results)
+
+         # Filter out machinery types with zero count
+         max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
+         total_machinery_count = sum(max_machine_types.values())
+
+         return max_people_count, total_machinery_count, max_machine_types
+
+     except Exception as e:
+         print(f"Error in YOLO detection: {str(e)}")
+         return 0, 0, {}
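+ # For videos these are per-frame maxima, i.e. the peak number of workers or
+ # machines visible in any single sampled frame; without cross-frame tracking
+ # this estimates peak presence, not a cumulative total.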
+
+ def process_yolo_results(results):
+     """Process YOLO detection results and count people and machinery"""
+     people_count = 0
+     machine_types = {
+         "Tower Crane": 0,
+         "Mobile Crane": 0,
+         "Compactor/Roller": 0,
+         "Bulldozer": 0,
+         "Excavator": 0,
+         "Dump Truck": 0,
+         "Concrete Mixer": 0,
+         "Loader": 0,
+         "Pump Truck": 0,
+         "Pile Driver": 0,
+         "Grader": 0,
+         "Other Vehicle": 0
+     }
+
+     # Map YOLO classes to machinery types. Matching below is by substring,
+     # so more specific keys must precede generic ones (e.g. 'pump_truck'
+     # and 'concrete_mixer_truck' before 'truck').
+     machinery_mapping = {
+         'tower_crane': "Tower Crane",
+         'mobile_crane': "Mobile Crane",
+         'compactor': "Compactor/Roller",
+         'roller': "Compactor/Roller",
+         'bulldozer': "Bulldozer",
+         'dozer': "Bulldozer",
+         'excavator': "Excavator",
+         'concrete_mixer_truck': "Concrete Mixer",
+         'pump_truck': "Pump Truck",
+         'dump_truck': "Dump Truck",
+         'truck': "Dump Truck",
+         'loader': "Loader",
+         'pile_driver': "Pile Driver",
+         'grader': "Grader",
+         'other_vehicle': "Other Vehicle"
+     }
+
+     # Process detection results
+     for r in results:
+         boxes = r.boxes
+         for box in boxes:
+             cls = int(box.cls[0])
+             conf = float(box.conf[0])
+             class_name = YOLO_MODEL.names[cls]
+
+             # Count people (Worker class)
+             if class_name.lower() == 'worker' and conf > 0.5:
+                 people_count += 1
+
+             # Count machinery
+             if conf > 0.5:
+                 class_lower = class_name.lower()
+                 for key, value in machinery_mapping.items():
+                     if key in class_lower:
+                         machine_types[value] += 1
+                         break
+
+     total_machinery = sum(machine_types.values())
+     return people_count, total_machinery, machine_types
+
+ def analyze_video_activities(video_path):
+     """Analyze video using mPLUG model with chunking"""
+     try:
+         all_responses = []
+         chunk_generator = encode_video_in_chunks(video_path)
+
+         for chunk_idx, video_frames in chunk_generator:
+             # Load fresh model instance for each chunk
+             model, tokenizer, processor = load_model_and_tokenizer()
+
+             # Process the chunk
+             prompt = "Analyze this construction site video chunk and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
+             response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
+             all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
+
+             # Clean up GPU memory
+             del model, tokenizer, processor
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         # Combine all responses
+         return "\n\n".join(all_responses)
+     except Exception as e:
+         print(f"Error analyzing video: {str(e)}")
+         return "Error analyzing video activities"
+
+ def process_image(image_path, model, tokenizer, processor, prompt):
+     """Process single image with mPLUG model"""
+     try:
+         image = Image.open(image_path)
+         messages = [{
+             "role": "user",
+             "content": prompt,
+             "images": [image]
+         }]
+
+         model_messages = []
+         images = []
+
+         for msg in messages:
+             content_str = msg["content"]
+             if "images" in msg and msg["images"]:
+                 content_str += "<|image|>"
+                 images.extend(msg["images"])
+             model_messages.append({
+                 "role": msg["role"],
+                 "content": content_str
+             })
+
+         model_messages.append({
+             "role": "assistant",
+             "content": ""
+         })
+
+         inputs = processor(
+             model_messages,
+             images=images,
+             videos=None
+         )
+         inputs.to('cuda')
+         inputs.update({
+             'tokenizer': tokenizer,
+             'max_new_tokens': 100,
+             'decode_text': True,
+         })
+
+         response = model.generate(**inputs)
+         return response[0]
+     except Exception as e:
+         print(f"Error processing image: {str(e)}")
+         return "Error processing image"
+
+ def analyze_image_activities(image_path):
+     """Analyze image using mPLUG model"""
+     try:
+         model, tokenizer, processor = load_model_and_tokenizer()
+         prompt = "Analyze this construction site image and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
+         response = process_image(image_path, model, tokenizer, processor, prompt)
+
+         del model, tokenizer, processor
+         if DEVICE == "cuda":
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         return response
+     except Exception as e:
+         print(f"Error analyzing image: {str(e)}")
+         return "Error analyzing image activities"
+
+
+ # ------------------------------------------------------------------
+ # NEW: Function to annotate each frame with bounding boxes & counts
+ # ------------------------------------------------------------------
+ def annotate_video_with_bboxes(video_path):
+     """
+     Reads the entire video frame-by-frame, runs YOLO, draws bounding boxes,
+     writes a per-frame summary of detected classes on the frame, and saves
+     as a new annotated video. Returns: annotated_video_path
+     """
+     cap = cv2.VideoCapture(video_path)
+     fps = cap.get(cv2.CAP_PROP_FPS)
+     w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     # Create a temp file for output
+     out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+     annotated_video_path = out_file.name
+     out_file.close()
+
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     writer = cv2.VideoWriter(annotated_video_path, fourcc, fps, (w, h))
+
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         results = YOLO_MODEL(frame)
+
+         # Dictionary to hold per-frame counts of each class
+         frame_counts = {}
+
+         for r in results:
+             boxes = r.boxes
+             for box in boxes:
+                 cls_id = int(box.cls[0])
+                 conf = float(box.conf[0])
+                 if conf < 0.5:
+                     continue  # Skip low-confidence detections
+
+                 x1, y1, x2, y2 = box.xyxy[0]
+                 class_name = YOLO_MODEL.names[cls_id]
+
+                 # Convert to int
+                 x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
+
+                 # Draw bounding box
+                 color = (0, 255, 0)
+                 cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
+
+                 label_text = f"{class_name} {conf:.2f}"
+                 cv2.putText(frame, label_text, (x1, y1 - 6),
+                             cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+                 # Increment per-frame class count
+                 frame_counts[class_name] = frame_counts.get(class_name, 0) + 1
+
+         # Build a summary line, e.g. "Worker: 2, Excavator: 1, ..."
+         summary_str = ", ".join(f"{cls_name}: {count}"
+                                 for cls_name, count in frame_counts.items())
+
+         # Put the summary text in the top-left
+         cv2.putText(
+             frame,
+             summary_str,
+             (15, 30),  # position
+             cv2.FONT_HERSHEY_SIMPLEX,
+             1.0,
+             (255, 255, 0),
+             2
+         )
+
+         writer.write(frame)
+
+     cap.release()
+     writer.release()
+     return annotated_video_path
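+ # Caveat: the 'mp4v' fourcc may not be playable in every browser's video
+ # element; if the local OpenCV build supports it, an H.264 fourcc such as
+ # 'avc1' is a safer choice for the Gradio video component.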
+
+
+
+ # ----------------------------------------------------------------------------
+ # Update process_diary function to also return an annotated video if it's video
+ # ----------------------------------------------------------------------------
+ @spaces.GPU
+ def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
+     """Process the site diary entry"""
+     if media is None:
+         # Return 6 text outputs as before + None for video
+         return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded", None]
+
+     try:
+         if not hasattr(media, 'name'):
+             raise ValueError("Invalid file upload")
+
+         file_ext = get_file_extension(media.name)
+         if not (is_image(media.name) or is_video(media.name)):
+             raise ValueError(f"Unsupported file type: {file_ext}")
+
+         with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
+             temp_path = temp_file.name
+             if hasattr(media, 'name') and os.path.exists(media.name):
+                 with open(media.name, 'rb') as f:
+                     temp_file.write(f.read())
+             else:
+                 file_content = media.read() if hasattr(media, 'read') else media
+                 temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
+
+         detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
+
+         # Default: no annotated video
+         annotated_video_path = None
+
+         if is_image(media.name):
+             # If it's an image, do normal image analysis
+             detected_activities = analyze_image_activities(temp_path)
+         else:
+             # If it's a video, do video analysis & also annotate the video
+             detected_activities = analyze_video_activities(temp_path)
+             annotated_video_path = annotate_video_with_bboxes(temp_path)
+
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+
+         detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
+         # Return 7 outputs (the first 6 as before, plus the annotated video path)
+         return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities, annotated_video_path]
+
+     except Exception as e:
+         print(f"Error processing media: {str(e)}")
+         return [day, date, "Error processing media", "Error processing media", "Error processing media", "Error processing media", None]
+
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Digital Site Diary") as demo:
+     gr.Markdown("# 📝 Digital Site Diary")
+
+     with gr.Row():
+         # User Input Column
+         with gr.Column():
+             gr.Markdown("### User Input")
+             day = gr.Textbox(label="Day", value='9')
+             date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
+             total_people = gr.Number(label="Total Number of People", precision=0, value=10)
+             total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
+             machinery_types = gr.Textbox(
+                 label="Number of Machinery Per Type",
+                 placeholder="e.g., Excavator: 2, Roller: 1",
+                 value="Excavator: 2, Roller: 1"
+             )
+             activities = gr.Textbox(
+                 label="Activity",
+                 placeholder="e.g., 9 AM: Excavation, 10 AM: Concreting",
+                 value="9 AM: Excavation, 10 AM: Concreting",
+                 lines=3
+             )
+             media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
+             submit_btn = gr.Button("Submit", variant="primary")
+
+         # Model Detection Column
+         with gr.Column():
+             gr.Markdown("### Model Detection")
+             model_day = gr.Textbox(label="Day")
+             model_date = gr.Textbox(label="Date")
+             model_people = gr.Textbox(label="Total Number of People")
+             model_machinery = gr.Textbox(label="Total Number of Machinery")
+             model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
+             model_activities = gr.Textbox(label="Activity", lines=5)
+             # NEW: annotated video output
+             model_annotated_video = gr.Video(label="Annotated Video")
+
+     # Connect the submit button to the processing function
+     submit_btn.click(
+         fn=process_diary,
+         inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
+         outputs=[
+             model_day,
+             model_date,
+             model_people,
+             model_machinery,
+             model_machinery_types,
+             model_activities,
+             model_annotated_video  # The new 7th output
+         ]
+     )
+
+ if __name__ == "__main__":
+     # launch
+     demo.launch(share=False, debug=True, show_api=False, server_port=args.port, server_name=args.host)
best_yolov11.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cff449e4fd3c5e66fe5a7443b680c5bda1f3613ee83bd2dea49faec5db5be324
+ size 40517477
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ # pip does not support per-requirement --index-url; use an extra index so the
+ # CUDA 11.8 PyTorch wheels resolve while everything else comes from PyPI
+ --extra-index-url https://download.pytorch.org/whl/cu118
+ torch
+ torchvision
+ torchaudio
+ icecream
+ markdown2
+ modelscope
+ pydantic
+ accelerate
+ transformers==4.37.2
+ tokenizers
+ sentencepiece
+ shortuuid
+ bitsandbytes
+ timm
+ requests
+ httpx==0.24.0
+ uvicorn
+ einops-exts
+ einops
+ scikit-learn
+ numpy
+ decord
+ opencv-python
+ # gradio==4.41.0
+ http://thunlp.oss-cn-qingdao.aliyuncs.com/multi_modal/never_delete/modelscope_studio-0.4.0.9-py3-none-any.whl
+ flash-attn
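+ # note: flash-attn builds from source against the installed torch and CUDA
+ # toolchain; on some systems it needs torch preinstalled (pip's build
+ # isolation can otherwise fail), so installing it separately with
+ # --no-build-isolation may be necessary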