assentian1970 committed on
Commit 1c7efef · verified · 1 Parent(s): a5b7e9c

Update app.py

Files changed (1)
  1. app.py +232 -83
app.py CHANGED
@@ -1,42 +1,32 @@
1
- #!/usr/bin/env python
2
- # encoding: utf-8
3
-
4
- import spaces
5
- import torch
6
-
7
- # Initialize GPU (this decorator ensures that GPU resources are allocated)
8
- @spaces.GPU
9
- def debug():
10
- torch.randn(10).cuda()
11
- debug()
12
-
13
- from datetime import datetime
14
  import gradio as gr
 
 
15
  from transformers import AutoModel, AutoTokenizer
16
  from modelscope.hub.snapshot_download import snapshot_download
17
  from PIL import Image
18
  from decord import VideoReader, cpu
19
  import os
20
  import gc
 
 
21
  import tempfile
22
- import cv2
23
- import numpy as np
24
  from ultralytics import YOLO
 
 
25
 
26
- # -------------------------------
27
- # Model and File Configurations
28
- # -------------------------------
29
-
30
- # Load your custom YOLOv11 model (adjust the path as needed)
31
- YOLO_MODEL = YOLO('/teamspace/studios/this_studio/best_yolov11.pt')
32
 
33
  # Check if CUDA is available
34
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
35
  if DEVICE == "cuda":
36
- # Quick GPU warm-up
37
- torch.randn(10).cuda()
 
38
 
39
- # Valid extensions for media files
40
  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
41
  VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
42
 
@@ -49,13 +39,14 @@ def is_image(filename):
49
  def is_video(filename):
50
  return get_file_extension(filename) in VIDEO_EXTENSIONS
51
 
52
- # -------------------------------
53
- # mPLUG-Owl Model Configuration
54
- # -------------------------------
55
  MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
56
  MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
 
 
57
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
58
 
 
59
  try:
60
  model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
61
  except Exception as e:
@@ -65,8 +56,9 @@ except Exception as e:
65
  MAX_NUM_FRAMES = 32
66
 
67
  def load_model_and_tokenizer():
68
- """Load a fresh instance of the model, tokenizer, and processor."""
69
  try:
 
70
  if DEVICE == "cuda":
71
  torch.cuda.empty_cache()
72
  gc.collect()
@@ -78,7 +70,11 @@ def load_model_and_tokenizer():
78
  torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
79
  device_map='auto'
80
  )
81
- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
82
  model.eval()
83
  processor = model.init_processor(tokenizer)
84
  return model, tokenizer, processor
@@ -86,52 +82,70 @@ def load_model_and_tokenizer():
86
  print(f"Error loading model: {str(e)}")
87
  raise
88
 
89
- # -------------------------------
90
- # Video & Image Processing Functions
91
- # -------------------------------
92
-
93
  def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
94
- """Process a chunk of video frames with the mPLUG model."""
95
- messages = [{
96
- "role": "user",
97
- "content": prompt,
98
- "video_frames": video_frames
99
- }]
 
 
100
 
101
  model_messages = []
102
  videos = []
 
103
  for msg in messages:
104
  content_str = msg["content"]
105
- if msg.get("video_frames"):
106
  content_str += "<|video|>"
107
  videos.append(msg["video_frames"])
108
- model_messages.append({"role": msg["role"], "content": content_str})
109
- model_messages.append({"role": "assistant", "content": ""})
110
-
111
- inputs = processor(model_messages, images=None, videos=videos if videos else None)
112
  inputs.to('cuda')
113
  inputs.update({
114
  'tokenizer': tokenizer,
115
  'max_new_tokens': 100,
116
  'decode_text': True,
117
  })
 
118
  response = model.generate(**inputs)
119
  return response[0]
120
 
121
  def encode_video_in_chunks(video_path):
122
- """Extract frames from a video and yield them in chunks."""
123
  vr = VideoReader(video_path, ctx=cpu(0))
124
- sample_fps = round(vr.get_avg_fps() / 1) # 1 FPS sampling
125
  frame_idx = [i for i in range(0, len(vr), sample_fps)]
126
- chunks = [frame_idx[i:i + MAX_NUM_FRAMES] for i in range(0, len(frame_idx), MAX_NUM_FRAMES)]
127
  for chunk_idx, chunk in enumerate(chunks):
128
  frames = vr.get_batch(chunk).asnumpy()
129
  frames = [Image.fromarray(v.astype('uint8')) for v in frames]
130
  yield chunk_idx, frames
131
 
132
  def detect_people_and_machinery(media_path):
133
- """Detect people and machinery (using YOLOv11) in an image or video."""
134
  try:
 
135
  max_people_count = 0
136
  max_machine_types = {
137
  "Tower Crane": 0,
@@ -148,49 +162,84 @@ def detect_people_and_machinery(media_path):
148
  "Other Vehicle": 0
149
  }
150
 
 
151
  if isinstance(media_path, str) and is_video(media_path):
152
  cap = cv2.VideoCapture(media_path)
153
  fps = cap.get(cv2.CAP_PROP_FPS)
154
- sample_rate = max(1, int(fps))
155
- frame_count = 0
 
156
  while cap.isOpened():
157
  ret, frame = cap.read()
158
  if not ret:
159
  break
 
 
160
  if frame_count % sample_rate == 0:
161
  results = YOLO_MODEL(frame)
162
  people, _, machine_types = process_yolo_results(results)
 
 
163
  max_people_count = max(max_people_count, people)
164
  for k, v in machine_types.items():
165
  max_machine_types[k] = max(max_machine_types[k], v)
 
166
  frame_count += 1
 
167
  cap.release()
 
168
  else:
169
- img = cv2.imread(media_path) if isinstance(media_path, str) else cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
170
  results = YOLO_MODEL(img)
171
  max_people_count, _, max_machine_types = process_yolo_results(results)
 
 
172
  max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
173
  total_machinery_count = sum(max_machine_types.values())
 
174
  return max_people_count, total_machinery_count, max_machine_types
 
175
  except Exception as e:
176
  print(f"Error in YOLO detection: {str(e)}")
177
  return 0, 0, {}
178
 
179
  def process_yolo_results(results):
180
- """Count detected workers and machinery from YOLO results."""
181
  people_count = 0
182
  machine_types = {
183
- "Tower Crane": 0, "Mobile Crane": 0, "Compactor/Roller": 0, "Bulldozer": 0,
184
- "Excavator": 0, "Dump Truck": 0, "Concrete Mixer": 0, "Loader": 0,
185
- "Pump Truck": 0, "Pile Driver": 0, "Grader": 0, "Other Vehicle": 0
186
  }
 
 
187
  for r in results:
188
- for box in r.boxes:
 
189
  cls = int(box.cls[0])
190
  conf = float(box.conf[0])
191
  class_name = YOLO_MODEL.names[cls]
 
 
192
  if class_name.lower() == 'worker' and conf > 0.5:
193
  people_count += 1
 
 
194
  machinery_mapping = {
195
  'tower_crane': "Tower Crane",
196
  'mobile_crane': "Mobile Crane",
@@ -208,33 +257,46 @@ def process_yolo_results(results):
208
  'grader': "Grader",
209
  'other_vehicle': "Other Vehicle"
210
  }
 
 
211
  if conf > 0.5:
 
212
  for key, value in machinery_mapping.items():
213
- if key in class_name.lower():
214
  machine_types[value] += 1
215
  break
216
- return people_count, sum(machine_types.values()), machine_types
 
 
217
 
218
  def analyze_video_activities(video_path):
219
- """Analyze a video by processing it in chunks with the mPLUG model."""
220
  try:
221
  all_responses = []
222
- for chunk_idx, video_frames in encode_video_in_chunks(video_path):
 
 
 
223
  model, tokenizer, processor = load_model_and_tokenizer()
224
- prompt = ("Analyze this construction site video chunk and describe the activities happening. "
225
- "Focus on construction activities, machinery usage, and worker actions.")
 
226
  response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
227
  all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
 
 
228
  del model, tokenizer, processor
229
  torch.cuda.empty_cache()
230
  gc.collect()
 
 
231
  return "\n\n".join(all_responses)
232
  except Exception as e:
233
  print(f"Error analyzing video: {str(e)}")
234
  return "Error analyzing video activities"
235
 
236
  def process_image(image_path, model, tokenizer, processor, prompt):
237
- """Analyze a single image with the mPLUG model."""
238
  try:
239
  image = Image.open(image_path)
240
  messages = [{
@@ -242,22 +304,37 @@ def process_image(image_path, model, tokenizer, processor, prompt):
242
  "content": prompt,
243
  "images": [image]
244
  }]
 
245
  model_messages = []
246
  images = []
 
247
  for msg in messages:
248
  content_str = msg["content"]
249
- if msg.get("images"):
250
  content_str += "<|image|>"
251
  images.extend(msg["images"])
252
- model_messages.append({"role": msg["role"], "content": content_str})
253
- model_messages.append({"role": "assistant", "content": ""})
254
- inputs = processor(model_messages, images=images, videos=None)
255
  inputs.to('cuda')
256
  inputs.update({
257
  'tokenizer': tokenizer,
258
  'max_new_tokens': 100,
259
  'decode_text': True,
260
  })
 
261
  response = model.generate(**inputs)
262
  return response[0]
263
  except Exception as e:
@@ -265,69 +342,120 @@ def process_image(image_path, model, tokenizer, processor, prompt):
265
  return "Error processing image"
266
 
267
  def analyze_image_activities(image_path):
268
- """Wrapper to analyze an image using mPLUG."""
269
  try:
270
  model, tokenizer, processor = load_model_and_tokenizer()
271
- prompt = ("Analyze this construction site image and describe the activities happening. "
272
- "Focus on construction activities, machinery usage, and worker actions.")
273
  response = process_image(image_path, model, tokenizer, processor, prompt)
 
274
  del model, tokenizer, processor
275
  if DEVICE == "cuda":
276
  torch.cuda.empty_cache()
277
  gc.collect()
 
278
  return response
279
  except Exception as e:
280
  print(f"Error analyzing image: {str(e)}")
281
  return "Error analyzing image activities"
282
 
 
 
 
 
283
  def annotate_video_with_bboxes(video_path):
284
- """Annotate each frame of the video with bounding boxes and a summary of detected classes."""
285
  cap = cv2.VideoCapture(video_path)
286
  fps = cap.get(cv2.CAP_PROP_FPS)
287
  w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
288
  h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
 
 
289
  out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
290
  annotated_video_path = out_file.name
291
  out_file.close()
 
292
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
293
  writer = cv2.VideoWriter(annotated_video_path, fourcc, fps, (w, h))
 
294
  while True:
295
  ret, frame = cap.read()
296
  if not ret:
297
  break
 
298
  results = YOLO_MODEL(frame)
 
 
299
  frame_counts = {}
 
300
  for r in results:
301
- for box in r.boxes:
 
302
  cls_id = int(box.cls[0])
303
  conf = float(box.conf[0])
304
  if conf < 0.5:
305
- continue
 
306
  x1, y1, x2, y2 = box.xyxy[0]
307
  class_name = YOLO_MODEL.names[cls_id]
 
 
308
  x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
309
- cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
310
  label_text = f"{class_name} {conf:.2f}"
311
  cv2.putText(frame, label_text, (x1, y1 - 6),
312
  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1)
 
 
313
  frame_counts[class_name] = frame_counts.get(class_name, 0) + 1
314
- summary_str = ", ".join(f"{cls_name}: {count}" for cls_name, count in frame_counts.items())
315
- cv2.putText(frame, summary_str, (15, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 0), 2)
316
  writer.write(frame)
 
317
  cap.release()
318
  writer.release()
319
  return annotated_video_path
320
321
  def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
322
- """Combine the inputs into a site diary entry and perform detection/analysis."""
323
  if media is None:
 
324
  return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded", None]
 
325
  try:
326
  if not hasattr(media, 'name'):
327
  raise ValueError("Invalid file upload")
 
328
  file_ext = get_file_extension(media.name)
329
  if not (is_image(media.name) or is_video(media.name)):
330
  raise ValueError(f"Unsupported file type: {file_ext}")
 
331
  with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
332
  temp_path = temp_file.name
333
  if hasattr(media, 'name') and os.path.exists(media.name):
@@ -336,32 +464,41 @@ def process_diary(day, date, total_people, total_machinery, machinery_types, act
336
  else:
337
  file_content = media.read() if hasattr(media, 'read') else media
338
  temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
 
339
  detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
 
 
340
  annotated_video_path = None
 
341
  if is_image(media.name):
 
342
  detected_activities = analyze_image_activities(temp_path)
343
  else:
 
344
  detected_activities = analyze_video_activities(temp_path)
345
  annotated_video_path = annotate_video_with_bboxes(temp_path)
 
346
  if os.path.exists(temp_path):
347
  os.remove(temp_path)
 
348
  detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
 
349
  return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities, annotated_video_path]
 
350
  except Exception as e:
351
  print(f"Error processing media: {str(e)}")
352
  return [day, date, "Error processing media", "Error processing media", "Error processing media", "Error processing media", None]
353
 
354
- # -------------------------------
355
- # Gradio Interface Setup
356
- # -------------------------------
357
 
 
358
  with gr.Blocks(title="Digital Site Diary") as demo:
359
  gr.Markdown("# 📝 Digital Site Diary")
 
360
  with gr.Row():
361
  # User Input Column
362
  with gr.Column():
363
  gr.Markdown("### User Input")
364
- day = gr.Textbox(label="Day", value='9')
365
  date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
366
  total_people = gr.Number(label="Total Number of People", precision=0, value=10)
367
  total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
@@ -378,6 +515,7 @@ with gr.Blocks(title="Digital Site Diary") as demo:
378
  )
379
  media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
380
  submit_btn = gr.Button("Submit", variant="primary")
 
381
  # Model Detection Column
382
  with gr.Column():
383
  gr.Markdown("### Model Detection")
@@ -387,12 +525,23 @@ with gr.Blocks(title="Digital Site Diary") as demo:
387
  model_machinery = gr.Textbox(label="Total Number of Machinery")
388
  model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
389
  model_activities = gr.Textbox(label="Activity", lines=5)
 
390
  model_annotated_video = gr.Video(label="Annotated Video")
 
 
391
  submit_btn.click(
392
  fn=process_diary,
393
  inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
394
- outputs=[model_day, model_date, model_people, model_machinery, model_machinery_types, model_activities, model_annotated_video]
  )
396
 
397
  if __name__ == "__main__":
398
- demo.launch(share=True)
1
  import gradio as gr
2
+ from datetime import datetime
3
+ import torch
4
  from transformers import AutoModel, AutoTokenizer
5
  from modelscope.hub.snapshot_download import snapshot_download
6
  from PIL import Image
7
  from decord import VideoReader, cpu
8
  import os
9
  import gc
10
+ import random
11
+ import io
12
  import tempfile
 
 
13
  from ultralytics import YOLO
14
+ import numpy as np
15
+ import cv2
16
 
17
+ # YOLO model configuration (path points at a local studio checkout; adjust as needed)
18
+ YOLO_MODEL = YOLO('/teamspace/studios/this_studio/best_yolov11.pt') # Load YOLOv11 model
19
 
20
  # Check if CUDA is available
21
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
22
+
23
+ # Initialize GPU if available
24
  if DEVICE == "cuda":
25
+ def debug():
26
+ torch.randn(10).cuda()
27
+ debug()
28
 
29
+ # File type validation
30
  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
31
  VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}
32
 
 
39
  def is_video(filename):
40
  return get_file_extension(filename) in VIDEO_EXTENSIONS
41
 
42
+ # Model configuration
 
 
43
  MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
44
  MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
45
+
46
+ # Create cache directory if it doesn't exist
47
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
48
 
49
+ # Download and cache the model
50
  try:
51
  model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
52
  except Exception as e:
 
56
  MAX_NUM_FRAMES = 32
57
 
58
  def load_model_and_tokenizer():
59
+ """Load a fresh instance of the model and tokenizer"""
60
  try:
61
+ # Clear GPU memory if using CUDA
62
  if DEVICE == "cuda":
63
  torch.cuda.empty_cache()
64
  gc.collect()
 
70
  torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
71
  device_map='auto'
72
  )
73
+
74
+ tokenizer = AutoTokenizer.from_pretrained(
75
+ model_path,
76
+ trust_remote_code=True
77
+ )
78
  model.eval()
79
  processor = model.init_processor(tokenizer)
80
  return model, tokenizer, processor
 
82
  print(f"Error loading model: {str(e)}")
83
  raise
84
 
 
 
 
 
85
  def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
86
+ """Process a chunk of video frames with mPLUG model"""
87
+ messages = [
88
+ {
89
+ "role": "user",
90
+ "content": prompt,
91
+ "video_frames": video_frames
92
+ }
93
+ ]
94
 
95
  model_messages = []
96
  videos = []
97
+
98
  for msg in messages:
99
  content_str = msg["content"]
100
+ if "video_frames" in msg and msg["video_frames"]:
101
  content_str += "<|video|>"
102
  videos.append(msg["video_frames"])
103
+ model_messages.append({
104
+ "role": msg["role"],
105
+ "content": content_str
106
+ })
107
+
108
+ model_messages.append({
109
+ "role": "assistant",
110
+ "content": ""
111
+ })
112
+
113
+ inputs = processor(
114
+ model_messages,
115
+ images=None,
116
+ videos=videos if videos else None
117
+ )
118
  inputs.to('cuda')
119
  inputs.update({
120
  'tokenizer': tokenizer,
121
  'max_new_tokens': 100,
122
  'decode_text': True,
123
  })
124
+
125
  response = model.generate(**inputs)
126
  return response[0]
127
 
128
  def encode_video_in_chunks(video_path):
129
+ """Extract frames from a video in chunks"""
130
  vr = VideoReader(video_path, ctx=cpu(0))
131
+ sample_fps = round(vr.get_avg_fps() / 1) # 1 FPS
132
  frame_idx = [i for i in range(0, len(vr), sample_fps)]
133
+
134
+ # Split frame indices into chunks
135
+ chunks = [
136
+ frame_idx[i:i + MAX_NUM_FRAMES]
137
+ for i in range(0, len(frame_idx), MAX_NUM_FRAMES)
138
+ ]
139
+
140
  for chunk_idx, chunk in enumerate(chunks):
141
  frames = vr.get_batch(chunk).asnumpy()
142
  frames = [Image.fromarray(v.astype('uint8')) for v in frames]
143
  yield chunk_idx, frames
144
 
145
  def detect_people_and_machinery(media_path):
146
+ """Detect people and machinery using YOLOv11 for both images and videos"""
147
  try:
148
+ # Initialize counters with maximum values
149
  max_people_count = 0
150
  max_machine_types = {
151
  "Tower Crane": 0,
 
162
  "Other Vehicle": 0
163
  }
164
 
165
+ # Check if input is video
166
  if isinstance(media_path, str) and is_video(media_path):
167
  cap = cv2.VideoCapture(media_path)
168
  fps = cap.get(cv2.CAP_PROP_FPS)
169
+ sample_rate = max(1, int(fps)) # Sample 1 frame per second
170
+ frame_count = 0 # Initialize frame counter
171
+
172
  while cap.isOpened():
173
  ret, frame = cap.read()
174
  if not ret:
175
  break
176
+
177
+ # Process every nth frame based on sample rate
178
  if frame_count % sample_rate == 0:
179
  results = YOLO_MODEL(frame)
180
  people, _, machine_types = process_yolo_results(results)
181
+
182
+ # Update maximum counts
183
  max_people_count = max(max_people_count, people)
184
  for k, v in machine_types.items():
185
  max_machine_types[k] = max(max_machine_types[k], v)
186
+
187
  frame_count += 1
188
+
189
  cap.release()
190
+
191
  else:
192
+ # Handle single image
193
+ if isinstance(media_path, str):
194
+ img = cv2.imread(media_path)
195
+ else:
196
+ # Handle PIL Image
197
+ img = cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
198
+
199
  results = YOLO_MODEL(img)
200
  max_people_count, _, max_machine_types = process_yolo_results(results)
201
+
202
+ # Filter out machinery types with zero count
203
  max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
204
  total_machinery_count = sum(max_machine_types.values())
205
+
206
  return max_people_count, total_machinery_count, max_machine_types
207
+
208
  except Exception as e:
209
  print(f"Error in YOLO detection: {str(e)}")
210
  return 0, 0, {}
211
 
212
  def process_yolo_results(results):
213
+ """Process YOLO detection results and count people and machinery"""
214
  people_count = 0
215
  machine_types = {
216
+ "Tower Crane": 0,
217
+ "Mobile Crane": 0,
218
+ "Compactor/Roller": 0,
219
+ "Bulldozer": 0,
220
+ "Excavator": 0,
221
+ "Dump Truck": 0,
222
+ "Concrete Mixer": 0,
223
+ "Loader": 0,
224
+ "Pump Truck": 0,
225
+ "Pile Driver": 0,
226
+ "Grader": 0,
227
+ "Other Vehicle": 0
228
  }
229
+
230
+ # Process detection results
231
  for r in results:
232
+ boxes = r.boxes
233
+ for box in boxes:
234
  cls = int(box.cls[0])
235
  conf = float(box.conf[0])
236
  class_name = YOLO_MODEL.names[cls]
237
+
238
+ # Count people (Worker class)
239
  if class_name.lower() == 'worker' and conf > 0.5:
240
  people_count += 1
241
+
242
+ # Map YOLO classes to machinery types
243
  machinery_mapping = {
244
  'tower_crane': "Tower Crane",
245
  'mobile_crane': "Mobile Crane",
 
257
  'grader': "Grader",
258
  'other_vehicle': "Other Vehicle"
259
  }
260
+
261
+ # Count machinery
262
  if conf > 0.5:
263
+ class_lower = class_name.lower()
264
  for key, value in machinery_mapping.items():
265
+ if key in class_lower:
266
  machine_types[value] += 1
267
  break
268
+
269
+ total_machinery = sum(machine_types.values())
270
+ return people_count, total_machinery, machine_types
271
 
272
  def analyze_video_activities(video_path):
273
+ """Analyze video using mPLUG model with chunking"""
274
  try:
275
  all_responses = []
276
+ chunk_generator = encode_video_in_chunks(video_path)
277
+
278
+ for chunk_idx, video_frames in chunk_generator:
279
+ # Load fresh model instance for each chunk
280
  model, tokenizer, processor = load_model_and_tokenizer()
281
+
282
+ # Process the chunk
283
+ prompt = "Analyze this construction site video chunk and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
284
  response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
285
  all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")
286
+
287
+ # Clean up GPU memory
288
  del model, tokenizer, processor
289
  torch.cuda.empty_cache()
290
  gc.collect()
291
+
292
+ # Combine all responses
293
  return "\n\n".join(all_responses)
294
  except Exception as e:
295
  print(f"Error analyzing video: {str(e)}")
296
  return "Error analyzing video activities"
297
 
298
  def process_image(image_path, model, tokenizer, processor, prompt):
299
+ """Process single image with mPLUG model"""
300
  try:
301
  image = Image.open(image_path)
302
  messages = [{
 
304
  "content": prompt,
305
  "images": [image]
306
  }]
307
+
308
  model_messages = []
309
  images = []
310
+
311
  for msg in messages:
312
  content_str = msg["content"]
313
+ if "images" in msg and msg["images"]:
314
  content_str += "<|image|>"
315
  images.extend(msg["images"])
316
+ model_messages.append({
317
+ "role": msg["role"],
318
+ "content": content_str
319
+ })
320
+
321
+ model_messages.append({
322
+ "role": "assistant",
323
+ "content": ""
324
+ })
325
+
326
+ inputs = processor(
327
+ model_messages,
328
+ images=images,
329
+ videos=None
330
+ )
331
  inputs.to('cuda')
332
  inputs.update({
333
  'tokenizer': tokenizer,
334
  'max_new_tokens': 100,
335
  'decode_text': True,
336
  })
337
+
338
  response = model.generate(**inputs)
339
  return response[0]
340
  except Exception as e:
 
342
  return "Error processing image"
343
 
344
  def analyze_image_activities(image_path):
345
+ """Analyze image using mPLUG model"""
346
  try:
347
  model, tokenizer, processor = load_model_and_tokenizer()
348
+ prompt = "Analyze this construction site image and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
 
349
  response = process_image(image_path, model, tokenizer, processor, prompt)
350
+
351
  del model, tokenizer, processor
352
  if DEVICE == "cuda":
353
  torch.cuda.empty_cache()
354
  gc.collect()
355
+
356
  return response
357
  except Exception as e:
358
  print(f"Error analyzing image: {str(e)}")
359
  return "Error analyzing image activities"
360
 
361
+
362
+ # ------------------------------------------------------------------
363
+ # NEW: Function to annotate each frame with bounding boxes & counts
364
+ # ------------------------------------------------------------------
365
  def annotate_video_with_bboxes(video_path):
366
+ """
367
+ Reads the entire video frame-by-frame, runs YOLO, draws bounding boxes,
368
+ writes a per-frame summary of detected classes on the frame, and saves
369
+ as a new annotated video. Returns: annotated_video_path
370
+ """
371
  cap = cv2.VideoCapture(video_path)
372
  fps = cap.get(cv2.CAP_PROP_FPS)
373
  w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
374
  h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
375
+
376
+ # Create a temp file for output
377
  out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
378
  annotated_video_path = out_file.name
379
  out_file.close()
380
+
381
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')
382
  writer = cv2.VideoWriter(annotated_video_path, fourcc, fps, (w, h))
383
+
384
  while True:
385
  ret, frame = cap.read()
386
  if not ret:
387
  break
388
+
389
  results = YOLO_MODEL(frame)
390
+
391
+ # Dictionary to hold per-frame counts of each class
392
  frame_counts = {}
393
+
394
  for r in results:
395
+ boxes = r.boxes
396
+ for box in boxes:
397
  cls_id = int(box.cls[0])
398
  conf = float(box.conf[0])
399
  if conf < 0.5:
400
+ continue # Skip low-confidence
401
+
402
  x1, y1, x2, y2 = box.xyxy[0]
403
  class_name = YOLO_MODEL.names[cls_id]
404
+
405
+ # Convert to int
406
  x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
407
+
408
+ # Draw bounding box
409
+ color = (0, 255, 0)
410
+ cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
411
+
412
  label_text = f"{class_name} {conf:.2f}"
413
  cv2.putText(frame, label_text, (x1, y1 - 6),
414
  cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1)
415
+
416
+ # Increment per-frame class count
417
  frame_counts[class_name] = frame_counts.get(class_name, 0) + 1
418
+
419
+ # Build a summary line, e.g. "Worker: 2, Excavator: 1, ..."
420
+ summary_str = ", ".join(f"{cls_name}: {count}"
421
+ for cls_name, count in frame_counts.items())
422
+
423
+ # Put the summary text in the top-left
424
+ cv2.putText(
425
+ frame,
426
+ summary_str,
427
+ (15, 30), # position
428
+ cv2.FONT_HERSHEY_SIMPLEX,
429
+ 1.0,
430
+ (255, 255, 0),
431
+ 2
432
+ )
433
+
434
  writer.write(frame)
435
+
436
  cap.release()
437
  writer.release()
438
  return annotated_video_path
439
 
440
+
441
+
442
+ # ----------------------------------------------------------------------------
443
+ # Update process_diary function to also return an annotated video if it's video
444
+ # ----------------------------------------------------------------------------
445
  def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
446
+ """Process the site diary entry"""
447
  if media is None:
448
+ # Return 6 text outputs as before + None for video
449
  return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded", None]
450
+
451
  try:
452
  if not hasattr(media, 'name'):
453
  raise ValueError("Invalid file upload")
454
+
455
  file_ext = get_file_extension(media.name)
456
  if not (is_image(media.name) or is_video(media.name)):
457
  raise ValueError(f"Unsupported file type: {file_ext}")
458
+
459
  with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
460
  temp_path = temp_file.name
461
  if hasattr(media, 'name') and os.path.exists(media.name):
 
464
  else:
465
  file_content = media.read() if hasattr(media, 'read') else media
466
  temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
467
+
468
  detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)
469
+
470
+ # Default: no annotated video
471
  annotated_video_path = None
472
+
473
  if is_image(media.name):
474
+ # If it's an image, do normal image analysis
475
  detected_activities = analyze_image_activities(temp_path)
476
  else:
477
+ # If it's a video, do video analysis & also annotate the video
478
  detected_activities = analyze_video_activities(temp_path)
479
  annotated_video_path = annotate_video_with_bboxes(temp_path)
480
+
481
  if os.path.exists(temp_path):
482
  os.remove(temp_path)
483
+
484
  detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
485
+ # Return 7 outputs (the first 6 as before, plus the annotated video path)
486
  return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities, annotated_video_path]
487
+
488
  except Exception as e:
489
  print(f"Error processing media: {str(e)}")
490
  return [day, date, "Error processing media", "Error processing media", "Error processing media", "Error processing media", None]
491
 
 
 
 
492
 
493
+ # Create the Gradio interface
494
  with gr.Blocks(title="Digital Site Diary") as demo:
495
  gr.Markdown("# 📝 Digital Site Diary")
496
+
497
  with gr.Row():
498
  # User Input Column
499
  with gr.Column():
500
  gr.Markdown("### User Input")
501
+ day = gr.Textbox(label="Day", value='9')
502
  date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
503
  total_people = gr.Number(label="Total Number of People", precision=0, value=10)
504
  total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
 
515
  )
516
  media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
517
  submit_btn = gr.Button("Submit", variant="primary")
518
+
519
  # Model Detection Column
520
  with gr.Column():
521
  gr.Markdown("### Model Detection")
 
525
  model_machinery = gr.Textbox(label="Total Number of Machinery")
526
  model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
527
  model_activities = gr.Textbox(label="Activity", lines=5)
528
+ # NEW: annotated video output
529
  model_annotated_video = gr.Video(label="Annotated Video")
530
+
531
+ # Connect the submit button to the processing function
532
  submit_btn.click(
533
  fn=process_diary,
534
  inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
535
+ outputs=[
536
+ model_day,
537
+ model_date,
538
+ model_people,
539
+ model_machinery,
540
+ model_machinery_types,
541
+ model_activities,
542
+ model_annotated_video # The new 7th output
543
+ ]
544
  )
545
 
546
  if __name__ == "__main__":
547
+ demo.launch(share=True)

This is my code; I want to deploy it on Hugging Face as a Gradio app.
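
One detail from the diff itself seems worth keeping in mind for a Hugging Face Space deployment: the earlier revision wrapped its CUDA warm-up in @spaces.GPU from the spaces package, and on ZeroGPU hardware that decorator is what grants CUDA access inside the call, so dropping it only works on CPU or dedicated-GPU hardware. Below is a minimal sketch of the pattern the removed lines used; the function name is illustrative, and the ZeroGPU hardware assumption is mine, not something the commit states.

import spaces  # available on Hugging Face Spaces; treat availability elsewhere as an assumption
import torch

@spaces.GPU  # on ZeroGPU Spaces, CUDA work must run inside a decorated function
def warm_up():
    # Same quick warm-up the removed debug() performed
    torch.randn(10).cuda()

warm_up()

Two smaller notes, both assumptions about the target environment rather than fixes to the code: a Space already serves the app publicly, so share=True in demo.launch() is usually unnecessary there, and the Space would need a requirements.txt covering the imports above (gradio, torch, transformers, modelscope, decord, ultralytics, opencv-python, pillow, numpy) plus the YOLO weights file, since /teamspace/studios/this_studio/best_yolov11.pt is a path on the local studio machine, not in the repo.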