assentian1970 committed on
Commit f3b123c · verified · 1 Parent(s): 32ceeb4

Update app.py

Files changed (1)
  1. app.py +205 -443
app.py CHANGED
@@ -1,38 +1,36 @@
- #!/usr/bin/env python
- # encoding: utf-8
-
- #import spaces
- #import torch
-
- # GPU initialization using Spaces decorator
- @spaces.GPU
- def debug():
-     torch.randn(10).cuda()
- debug()
-
- from datetime import datetime
  import torch
  from transformers import AutoModel, AutoTokenizer
- from modelscope.hub.snapshot_download import snapshot_download
  from PIL import Image
  from decord import VideoReader, cpu
  import os
  import gc
- import random
- import io
  import tempfile
  from ultralytics import YOLO
  import numpy as np
  import cv2
- import gradio as gr

- # Set device for torch computations
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

- # Load YOLOv11 model (remove device argument because it's not supported)
- YOLO_MODEL = YOLO('best_yolov11.pt')

- # File type validation
  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
  VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

@@ -45,507 +43,271 @@ def is_image(filename):
  def is_video(filename):
      return get_file_extension(filename) in VIDEO_EXTENSIONS

- # Model configuration
- MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
- MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
-
- # Create cache directory if it doesn't exist
- os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
-
- # Download and cache the model
- try:
-     model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
- except Exception as e:
-     print(f"Error downloading model: {str(e)}")
-     model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
-
- MAX_NUM_FRAMES = 32
-
  def load_model_and_tokenizer():
-     """Load a fresh instance of the model and tokenizer"""
      try:
-         # Clear GPU memory if using CUDA
-         if DEVICE == "cuda":
-             torch.cuda.empty_cache()
-             gc.collect()

          model = AutoModel.from_pretrained(
-             model_path,
              attn_implementation='sdpa',
              trust_remote_code=True,
-             torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
-             device_map='auto'
          )

          tokenizer = AutoTokenizer.from_pretrained(
-             model_path,
              trust_remote_code=True
          )
-         model.eval()
          processor = model.init_processor(tokenizer)
          return model, tokenizer, processor
      except Exception as e:
-         print(f"Error loading model: {str(e)}")
          raise

- #def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
-     """Process a chunk of video frames with mPLUG model"""
-     messages = [
-         {
-             "role": "user",
-             "content": prompt,
-             "video_frames": video_frames
-         }
-     ]
-
-     model_messages = []
-     videos = []
-
-     for msg in messages:
-         content_str = msg["content"]
-         if "video_frames" in msg and msg["video_frames"]:
-             content_str += "<|video|>"
-             videos.append(msg["video_frames"])
-         model_messages.append({
-             "role": msg["role"],
-             "content": content_str
-         })
-
-     model_messages.append({
-         "role": "assistant",
-         "content": ""
-     })
-
-     inputs = processor(
-         model_messages,
-         images=None,
-         videos=videos if videos else None
-     )
-     # Use DEVICE variable so that CPU-only environments aren't forced to cuda
-     inputs.to(DEVICE)
-     inputs.update({
-         'tokenizer': tokenizer,
-         'max_new_tokens': 100,
-         'decode_text': True,
-     })
-
-     response = model.generate(**inputs)
-     return response[0]
-
- def encode_video_in_chunks(video_path):
-     """Extract frames from a video in chunks"""
-     vr = VideoReader(video_path, ctx=cpu(0))
-     sample_fps = round(vr.get_avg_fps() / 1)  # 1 FPS
-     frame_idx = [i for i in range(0, len(vr), sample_fps)]

-     # Split frame indices into chunks
-     chunks = [
-         frame_idx[i:i + MAX_NUM_FRAMES]
-         for i in range(0, len(frame_idx), MAX_NUM_FRAMES)
-     ]

-     for chunk_idx, chunk in enumerate(chunks):
-         frames = vr.get_batch(chunk).asnumpy()
-         frames = [Image.fromarray(v.astype('uint8')) for v in frames]
-         yield chunk_idx, frames

  def detect_people_and_machinery(media_path):
-     """Detect people and machinery using YOLOv11 for both images and videos"""
      try:
-         # Initialize counters with maximum values
-         max_people_count = 0
-         max_machine_types = {
-             "Tower Crane": 0,
-             "Mobile Crane": 0,
-             "Compactor/Roller": 0,
-             "Bulldozer": 0,
-             "Excavator": 0,
-             "Dump Truck": 0,
-             "Concrete Mixer": 0,
-             "Loader": 0,
-             "Pump Truck": 0,
-             "Pile Driver": 0,
-             "Grader": 0,
-             "Other Vehicle": 0
-         }

-         # Check if input is video
          if isinstance(media_path, str) and is_video(media_path):
              cap = cv2.VideoCapture(media_path)
              fps = cap.get(cv2.CAP_PROP_FPS)
-             sample_rate = max(1, int(fps))  # Sample 1 frame per second
-             frame_count = 0  # Initialize frame counter
-
              while cap.isOpened():
                  ret, frame = cap.read()
                  if not ret:
                      break
-
-                 # Process every nth frame based on sample rate
-                 if frame_count % sample_rate == 0:
-                     results = YOLO_MODEL(frame)
-                     people, _, machine_types = process_yolo_results(results)
-
-                     # Update maximum counts
-                     max_people_count = max(max_people_count, people)
-                     for k, v in machine_types.items():
-                         max_machine_types[k] = max(max_machine_types[k], v)
-
-                 frame_count += 1
-
              cap.release()
-
          else:
-             # Handle single image
-             if isinstance(media_path, str):
-                 img = cv2.imread(media_path)
-             else:
-                 # Handle PIL Image
-                 img = cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
-
              results = YOLO_MODEL(img)
-             max_people_count, _, max_machine_types = process_yolo_results(results)
-
-         # Filter out machinery types with zero count
-         max_machine_types = {k: v for k, v in max_machine_types.items() if v > 0}
-         total_machinery_count = sum(max_machine_types.values())
-
-         return max_people_count, total_machinery_count, max_machine_types

      except Exception as e:
-         print(f"Error in YOLO detection: {str(e)}")
          return 0, 0, {}

- def process_yolo_results(results):
-     """Process YOLO detection results and count people and machinery"""
-     people_count = 0
-     machine_types = {
-         "Tower Crane": 0,
-         "Mobile Crane": 0,
-         "Compactor/Roller": 0,
-         "Bulldozer": 0,
-         "Excavator": 0,
-         "Dump Truck": 0,
-         "Concrete Mixer": 0,
-         "Loader": 0,
-         "Pump Truck": 0,
-         "Pile Driver": 0,
-         "Grader": 0,
-         "Other Vehicle": 0
-     }
-
-     # Process detection results
-     for r in results:
-         boxes = r.boxes
-         for box in boxes:
-             cls = int(box.cls[0])
-             conf = float(box.conf[0])
-             class_name = YOLO_MODEL.names[cls]
-
-             # Count people (Worker class)
-             if class_name.lower() == 'worker' and conf > 0.5:
-                 people_count += 1
-
-             # Map YOLO classes to machinery types
-             machinery_mapping = {
-                 'tower_crane': "Tower Crane",
-                 'mobile_crane': "Mobile Crane",
-                 'compactor': "Compactor/Roller",
-                 'roller': "Compactor/Roller",
-                 'bulldozer': "Bulldozer",
-                 'dozer': "Bulldozer",
-                 'excavator': "Excavator",
-                 'dump_truck': "Dump Truck",
-                 'truck': "Dump Truck",
-                 'concrete_mixer_truck': "Concrete Mixer",
-                 'loader': "Loader",
-                 'pump_truck': "Pump Truck",
-                 'pile_driver': "Pile Driver",
-                 'grader': "Grader",
-                 'other_vehicle': "Other Vehicle"
-             }
-
-             # Count machinery
-             if conf > 0.5:
-                 class_lower = class_name.lower()
-                 for key, value in machinery_mapping.items():
-                     if key in class_lower:
-                         machine_types[value] += 1
-                         break
-
-     total_machinery = sum(machine_types.values())
-     return people_count, total_machinery, machine_types
-
  def analyze_video_activities(video_path):
-     """Analyze video using mPLUG model with chunking"""
      try:
-         all_responses = []
-         chunk_generator = encode_video_in_chunks(video_path)

-         for chunk_idx, video_frames in chunk_generator:
-             # Load fresh model instance for each chunk
-             model, tokenizer, processor = load_model_and_tokenizer()

-             # Process the chunk
-             prompt = "Analyze this construction site video chunk and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
-             response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
-             all_responses.append(f"Time period {chunk_idx + 1}:\n{response}")

-             # Clean up GPU memory
-             del model, tokenizer, processor
-             torch.cuda.empty_cache()
-             gc.collect()
-
-         # Combine all responses
-         return "\n\n".join(all_responses)
-     except Exception as e:
-         print(f"Error analyzing video: {str(e)}")
-         return "Error analyzing video activities"
-
- def process_image(image_path, model, tokenizer, processor, prompt):
-     """Process single image with mPLUG model"""
-     try:
-         image = Image.open(image_path)
-         messages = [{
-             "role": "user",
-             "content": prompt,
-             "images": [image]
-         }]
-
-         model_messages = []
-         images = []
-
-         for msg in messages:
-             content_str = msg["content"]
-             if "images" in msg and msg["images"]:
-                 content_str += "<|image|>"
-                 images.extend(msg["images"])
-             model_messages.append({
-                 "role": msg["role"],
-                 "content": content_str
-             })
-
-         model_messages.append({
-             "role": "assistant",
-             "content": ""
-         })
-
-         inputs = processor(
-             model_messages,
-             images=images,
-             videos=None
-         )
-         # Use the DEVICE variable for transferring inputs
-         inputs.to(DEVICE)
-         inputs.update({
-             'tokenizer': tokenizer,
-             'max_new_tokens': 100,
-             'decode_text': True,
-         })
-
-         response = model.generate(**inputs)
-         return response[0]
      except Exception as e:
-         print(f"Error processing image: {str(e)}")
-         return "Error processing image"

  def analyze_image_activities(image_path):
-     """Analyze image using mPLUG model"""
      try:
          model, tokenizer, processor = load_model_and_tokenizer()
-         prompt = "Analyze this construction site image and describe the activities happening. Focus on construction activities, machinery usage, and worker actions."
-         response = process_image(image_path, model, tokenizer, processor, prompt)

          del model, tokenizer, processor
-         if DEVICE == "cuda":
-             torch.cuda.empty_cache()
-         gc.collect()
-
-         return response
      except Exception as e:
-         print(f"Error analyzing image: {str(e)}")
-         return "Error analyzing image activities"

- # ------------------------------------------------------------------
- # Function to annotate each frame with bounding boxes & counts
- # ------------------------------------------------------------------
  def annotate_video_with_bboxes(video_path):
-     """
-     Reads the entire video frame-by-frame, runs YOLO, draws bounding boxes,
-     writes a per-frame summary of detected classes on the frame, and saves
-     as a new annotated video. Returns: annotated_video_path
-     """
      cap = cv2.VideoCapture(video_path)
      fps = cap.get(cv2.CAP_PROP_FPS)
-     w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-     h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-     # Create a temp file for output
-     out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
-     annotated_video_path = out_file.name
-     out_file.close()

-     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-     writer = cv2.VideoWriter(annotated_video_path, fourcc, fps, (w, h))

-     while True:
          ret, frame = cap.read()
          if not ret:
              break
-
          results = YOLO_MODEL(frame)
-
-         # Dictionary to hold per-frame counts of each class
-         frame_counts = {}
-
          for r in results:
-             boxes = r.boxes
-             for box in boxes:
-                 cls_id = int(box.cls[0])
-                 conf = float(box.conf[0])
-                 if conf < 0.5:
-                     continue  # Skip low-confidence
-
-                 x1, y1, x2, y2 = box.xyxy[0]
                  class_name = YOLO_MODEL.names[cls_id]
-
-                 # Convert coordinates to int
-                 x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
-
                  # Draw bounding box
-                 color = (0, 255, 0)
-                 cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
-
-                 label_text = f"{class_name} {conf:.2f}"
-                 cv2.putText(frame, label_text, (x1, y1 - 6),
-                             cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1)
-
-                 # Increment per-frame class count
-                 frame_counts[class_name] = frame_counts.get(class_name, 0) + 1
-
-         # Build a summary line, e.g. "Worker: 2, Excavator: 1, ..."
-         summary_str = ", ".join(f"{cls_name}: {count}"
-                                 for cls_name, count in frame_counts.items())
-
-         # Put the summary text in the top-left
-         cv2.putText(
-             frame,
-             summary_str,
-             (15, 30),  # position
-             cv2.FONT_HERSHEY_SIMPLEX,
-             1.0,
-             (255, 255, 0),
-             2
-         )
-
          writer.write(frame)
-
      cap.release()
      writer.release()
-     return annotated_video_path
-
- # ----------------------------------------------------------------------------
- # Update process_diary function to also return an annotated video if it's video
- # ----------------------------------------------------------------------------
- def process_diary(day, date, total_people, total_machinery, machinery_types, activities, media):
-     """Process the site diary entry"""
-     if media is None:
-         # Return 6 text outputs as before plus None for video
-         return [day, date, "No media uploaded", "No media uploaded", "No media uploaded", "No media uploaded", None]

      try:
-         if not hasattr(media, 'name'):
-             raise ValueError("Invalid file upload")
-
-         file_ext = get_file_extension(media.name)
-         if not (is_image(media.name) or is_video(media.name)):
-             raise ValueError(f"Unsupported file type: {file_ext}")
-
-         with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
-             temp_path = temp_file.name
-             if hasattr(media, 'name') and os.path.exists(media.name):
-                 with open(media.name, 'rb') as f:
-                     temp_file.write(f.read())
-             else:
-                 file_content = media.read() if hasattr(media, 'read') else media
-                 temp_file.write(file_content if isinstance(file_content, bytes) else file_content.read())
-
-         detected_people, detected_machinery, detected_machinery_types = detect_people_and_machinery(temp_path)

-         # Default: no annotated video
-         annotated_video_path = None
-
          if is_image(media.name):
-             # If it's an image, do normal image analysis
-             detected_activities = analyze_image_activities(temp_path)
          else:
-             # If it's a video, do video analysis and also annotate the video
-             detected_activities = analyze_video_activities(temp_path)
-             annotated_video_path = annotate_video_with_bboxes(temp_path)
-
-         if os.path.exists(temp_path):
-             os.remove(temp_path)
-
-         detected_types_str = ", ".join([f"{k}: {v}" for k, v in detected_machinery_types.items()])
-         # Return 7 outputs (the first 6 as before, plus the annotated video path)
-         return [day, date, str(detected_people), str(detected_machinery), detected_types_str, detected_activities, annotated_video_path]
-
      except Exception as e:
-         print(f"Error processing media: {str(e)}")
-         return [day, date, "Error processing media", "Error processing media", "Error processing media", "Error processing media", None]

- # Create the Gradio interface
- with gr.Blocks(title="Digital Site Diary") as demo:
-     gr.Markdown("# 📝 Digital Site Diary")

      with gr.Row():
-         # User Input Column
          with gr.Column():
-             gr.Markdown("### User Input")
-             day = gr.Textbox(label="Day", value='9')
-             date = gr.Textbox(label="Date", placeholder="YYYY-MM-DD", value=datetime.now().strftime("%Y-%m-%d"))
-             total_people = gr.Number(label="Total Number of People", precision=0, value=10)
-             total_machinery = gr.Number(label="Total Number of Machinery", precision=0, value=3)
-             machinery_types = gr.Textbox(
-                 label="Number of Machinery Per Type",
-                 placeholder="e.g., Excavator: 2, Roller: 1",
-                 value="Excavator: 2, Roller: 1"
-             )
-             activities = gr.Textbox(
-                 label="Activity",
-                 placeholder="e.g., 9 AM: Excavation, 10 AM: Concreting",
-                 value="9 AM: Excavation, 10 AM: Concreting",
-                 lines=3
-             )
-             media = gr.File(label="Upload Image/Video", file_types=["image", "video"])
-             submit_btn = gr.Button("Submit", variant="primary")
-
-         # Model Detection Column
          with gr.Column():
-             gr.Markdown("### Model Detection")
              model_day = gr.Textbox(label="Day")
              model_date = gr.Textbox(label="Date")
-             model_people = gr.Textbox(label="Total Number of People")
-             model_machinery = gr.Textbox(label="Total Number of Machinery")
-             model_machinery_types = gr.Textbox(label="Number of Machinery Per Type")
-             model_activities = gr.Textbox(label="Activity", lines=5)
-             # Annotated video output
-             model_annotated_video = gr.Video(label="Annotated Video")

-     # Connect the submit button to the processing function
      submit_btn.click(
-         fn=process_diary,
-         inputs=[day, date, total_people, total_machinery, machinery_types, activities, media],
-         outputs=[
-             model_day,
-             model_date,
-             model_people,
-             model_machinery,
-             model_machinery_types,
-             model_activities,
-             model_annotated_video  # The new 7th output
-         ]
      )

  if __name__ == "__main__":
-     demo.launch(share=False)

+ import spaces
  import torch
+ from datetime import datetime
  from transformers import AutoModel, AutoTokenizer
+ import gradio as gr
  from PIL import Image
  from decord import VideoReader, cpu
  import os
  import gc
  import tempfile
  from ultralytics import YOLO
  import numpy as np
  import cv2
+ from modelscope.hub.snapshot_download import snapshot_download

+ # Initialize GPU
+ @spaces.GPU
+ def initialize_gpu():
+     if torch.cuda.is_available():
+         torch.randn(10).cuda()
+ initialize_gpu()
+
+ # Load YOLO model
+ YOLO_MODEL = YOLO('best_yolov11.pt')  # Keep this file in repo root
+
+ # Model configuration
+ MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
+ model_dir = snapshot_download(MODEL_NAME, cache_dir='./models')

+ # Device setup
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

+ # File validation
  IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
  VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v'}

  def is_video(filename):
      return get_file_extension(filename) in VIDEO_EXTENSIONS

+ @spaces.GPU
  def load_model_and_tokenizer():
+     """Load 4-bit quantized model for memory efficiency"""
      try:
+         torch.cuda.empty_cache()
+         gc.collect()

          model = AutoModel.from_pretrained(
+             model_dir,
              attn_implementation='sdpa',
              trust_remote_code=True,
+             load_in_4bit=True,
+             device_map="auto",
+             torch_dtype=torch.bfloat16
          )

          tokenizer = AutoTokenizer.from_pretrained(
+             model_dir,
              trust_remote_code=True
          )
          processor = model.init_processor(tokenizer)
+         model.eval()
          return model, tokenizer, processor
      except Exception as e:
+         print(f"Model loading error: {str(e)}")
          raise

+ def process_yolo_results(results):
+     """Process YOLO detection results"""
+     machinery_mapping = {
+         'tower_crane': "Tower Crane",
+         'mobile_crane': "Mobile Crane",
+         'compactor': "Compactor/Roller",
+         'roller': "Compactor/Roller",
+         'bulldozer': "Bulldozer",
+         'dozer': "Bulldozer",
+         'excavator': "Excavator",
+         'dump_truck': "Dump Truck",
+         'truck': "Dump Truck",
+         'concrete_mixer_truck': "Concrete Mixer",
+         'loader': "Loader",
+         'pump_truck': "Pump Truck",
+         'pile_driver': "Pile Driver",
+         'grader': "Grader",
+         'other_vehicle': "Other Vehicle"
+     }

+     counts = {"Worker": 0, **{v: 0 for v in machinery_mapping.values()}}
+
+     for r in results:
+         for box in r.boxes:
+             if box.conf.item() < 0.5:
+                 continue
+
+             cls_name = YOLO_MODEL.names[int(box.cls.item())].lower()
+             if cls_name == 'worker':
+                 counts["Worker"] += 1
+                 continue
+
+             for key, value in machinery_mapping.items():
+                 if key in cls_name:
+                     counts[value] += 1
+                     break

+     return counts["Worker"], sum(counts.values()) - counts["Worker"], counts

+ @spaces.GPU
  def detect_people_and_machinery(media_path):
+     """GPU-accelerated detection"""
      try:
+         max_people = 0
+         max_machines = {k: 0 for k in [
+             "Tower Crane", "Mobile Crane", "Compactor/Roller", "Bulldozer",
+             "Excavator", "Dump Truck", "Concrete Mixer", "Loader",
+             "Pump Truck", "Pile Driver", "Grader", "Other Vehicle"
+         ]}

          if isinstance(media_path, str) and is_video(media_path):
              cap = cv2.VideoCapture(media_path)
              fps = cap.get(cv2.CAP_PROP_FPS)
+             sample_rate = max(1, int(fps))
+
              while cap.isOpened():
                  ret, frame = cap.read()
                  if not ret:
                      break
+
+                 results = YOLO_MODEL(frame)
+                 people, machines, types = process_yolo_results(results)
+
+                 max_people = max(max_people, people)
+                 for k in max_machines:
+                     max_machines[k] = max(max_machines[k], types.get(k, 0))
+
              cap.release()
          else:
+             img = cv2.imread(media_path) if isinstance(media_path, str) else cv2.cvtColor(np.array(media_path), cv2.COLOR_RGB2BGR)
              results = YOLO_MODEL(img)
+             max_people, _, types = process_yolo_results(results)
+             for k in max_machines:
+                 max_machines[k] = types.get(k, 0)

+         filtered = {k: v for k, v in max_machines.items() if v > 0}
+         return max_people, sum(filtered.values()), filtered
+
      except Exception as e:
+         print(f"Detection error: {str(e)}")
          return 0, 0, {}

+ @spaces.GPU
  def analyze_video_activities(video_path):
+     """Video analysis with chunk processing"""
      try:
+         model, tokenizer, processor = load_model_and_tokenizer()
+         responses = []

+         vr = VideoReader(video_path, ctx=cpu(0))
+         frame_step = max(1, int(vr.get_avg_fps()))
+         frames = [Image.fromarray(f.asnumpy()) for f in vr[::frame_step]]
+
+         # Process in chunks
+         for i in range(0, len(frames), 16):
+             chunk = frames[i:i+16]
+             inputs = processor(
+                 [{"role": "user", "content": "Analyze construction activities", "video_frames": chunk}],
+                 videos=[chunk]
+             ).to(DEVICE)

+             response = model.generate(**inputs, max_new_tokens=200)
+             responses.append(response[0])

+         del model, tokenizer, processor
+         torch.cuda.empty_cache()
+         return "\n".join(responses)
+
      except Exception as e:
+         print(f"Video analysis error: {str(e)}")
+         return "Activity analysis unavailable"

+ @spaces.GPU
  def analyze_image_activities(image_path):
+     """Image analysis pipeline"""
      try:
          model, tokenizer, processor = load_model_and_tokenizer()
+         image = Image.open(image_path).convert("RGB")
+
+         inputs = processor(
+             [{"role": "user", "content": "Analyze construction site", "images": [image]}],
+             images=[image]
+         ).to(DEVICE)

+         response = model.generate(**inputs, max_new_tokens=200)
          del model, tokenizer, processor
+         return response[0]
+
      except Exception as e:
+         print(f"Image analysis error: {str(e)}")
+         return "Activity analysis unavailable"

+ @spaces.GPU
  def annotate_video_with_bboxes(video_path):
+     """Video annotation with real-time detection"""
      cap = cv2.VideoCapture(video_path)
      fps = cap.get(cv2.CAP_PROP_FPS)
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

+     temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+     writer = cv2.VideoWriter(temp_file.name, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

+     while cap.isOpened():
          ret, frame = cap.read()
          if not ret:
              break
+
          results = YOLO_MODEL(frame)
+         counts = {}
+
          for r in results:
+             for box in r.boxes:
+                 if box.conf.item() < 0.5:
+                     continue
+
+                 cls_id = int(box.cls.item())
                  class_name = YOLO_MODEL.names[cls_id]
+                 counts[class_name] = counts.get(class_name, 0) + 1
+
                  # Draw bounding box
+                 x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
+                 cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
+                 cv2.putText(frame, f"{class_name} {box.conf.item():.2f}",
+                             (x1, y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255), 1)
+
+         # Add summary text
+         summary = ", ".join([f"{k}:{v}" for k,v in counts.items()])
+         cv2.putText(frame, summary, (10,30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0,0,255), 2)
+
          writer.write(frame)
+
      cap.release()
      writer.release()
+     return temp_file.name

+ def process_diary(day, date, people, machinery, machinery_types, activities, media):
+     """Main processing pipeline"""
      try:
+         if not media:
+             return [day, date, "No data", "No data", "No data", "No data", None]
+
+         with tempfile.NamedTemporaryFile(delete=False) as tmp:
+             tmp.write(media.read())
+             media_path = tmp.name
+
+         detected_people, detected_machinery, machine_types = detect_people_and_machinery(media_path)
+         annotated_video = None

          if is_image(media.name):
+             activities = analyze_image_activities(media_path)
          else:
+             activities = analyze_video_activities(media_path)
+             annotated_video = annotate_video_with_bboxes(media_path)
+
+         os.remove(media_path)
+         return [
+             day,
+             date,
+             str(detected_people),
+             str(detected_machinery),
+             ", ".join([f"{k}:{v}" for k,v in machine_types.items()]),
+             activities,
+             annotated_video
+         ]
+
      except Exception as e:
+         print(f"Processing error: {str(e)}")
+         return [day, date, "Error", "Error", "Error", "Error", None]

+ # Gradio Interface
+ with gr.Blocks(title="Digital Site Diary", css="video {height: auto !important;}") as demo:
+     gr.Markdown("# 🏗️ Digital Construction Diary")

      with gr.Row():
          with gr.Column():
+             gr.Markdown("### Site Details")
+             day = gr.Textbox(label="Day Number", value="1")
+             date = gr.Textbox(label="Date", value=datetime.now().strftime("%Y-%m-%d"))
+             media = gr.File(label="Upload Media", file_types=["image", "video"])
+             submit_btn = gr.Button("Generate Report", variant="primary")
+
          with gr.Column():
+             gr.Markdown("### Safety Report")
              model_day = gr.Textbox(label="Day")
              model_date = gr.Textbox(label="Date")
+             model_people = gr.Textbox(label="Worker Count")
+             model_machinery = gr.Textbox(label="Machinery Count")
+             model_machinery_types = gr.Textbox(label="Machinery Breakdown")
+             model_activities = gr.Textbox(label="Activity Analysis", lines=4)
+             model_video = gr.Video(label="Safety Annotations")

      submit_btn.click(
+         process_diary,
+         inputs=[day, date, None, None, None, None, media],
+         outputs=[model_day, model_date, model_people, model_machinery,
+                  model_machinery_types, model_activities, model_video]
      )

  if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)