assentian1970 committed on
Commit
2855c1d
·
verified ·
1 Parent(s): 6c50846

Update image_captioning.py

Browse files
Files changed (1) hide show
  1. image_captioning.py +225 -80
image_captioning.py CHANGED
@@ -2,6 +2,7 @@ import torch
2
  from transformers import AutoModel, AutoTokenizer
3
  from modelscope.hub.snapshot_download import snapshot_download
4
  from PIL import Image
 
5
  from decord import VideoReader, cpu
6
  import os
7
  import gc
@@ -9,10 +10,11 @@ import cv2
9
  import tempfile
10
  import shutil
11
  import subprocess
 
12
  from yolo_detection import is_image, is_video
13
 
14
  # Constants for video processing
15
- MAX_NUM_FRAMES = 32
16
 
17
  # Check if CUDA is available
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
@@ -27,7 +29,7 @@ if DEVICE == "cuda":
27
 
28
  # Model configuration
29
  MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
30
- MODEL_CACHE_DIR = "/data/models"
31
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
32
 
33
  # Download and cache the model
@@ -39,8 +41,10 @@ except Exception as e:
39
 
40
 
41
  # Model configuration and existing functions remain unchanged...
 
42
  def load_model_and_tokenizer():
43
- """Load a fresh instance of the model and tokenizer"""
 
44
  try:
45
  # Clear GPU memory if using CUDA
46
  if DEVICE == "cuda":
@@ -66,6 +70,7 @@ def load_model_and_tokenizer():
66
  print(f"Error loading model: {str(e)}")
67
  raise
68
 
 
69
  def process_image(image_path, model, tokenizer, processor, prompt):
70
  """Process single image with mPLUG model"""
71
  try:
@@ -153,27 +158,168 @@ def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
153
  })
154
 
155
  response = model.generate(**inputs)
 
156
  return response[0]
157
 
158
  def split_original_video(video_path, chunk_info):
159
- """Split original video into chunks using precise timestamps"""
160
  original_chunks = []
 
161
  tmp_dir = os.path.join('.', 'tmp')
162
-
 
 
 
 
 
 
 
 
 
163
  for chunk in chunk_info:
164
- output_path = os.path.join(tmp_dir, f"original_chunk_{chunk['chunk_id']}.mp4")
165
- # Use ffmpeg for precise splitting without re-encoding
166
- cmd = [
167
- 'ffmpeg',
168
- '-ss', str(chunk['start_time']),
169
- '-to', str(chunk['end_time']),
170
- '-i', video_path,
171
- '-c', 'copy',
172
- output_path
173
- ]
174
- subprocess.run(cmd, check=True)
175
- original_chunks.append(output_path)
176
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  return original_chunks
178
 
179
  def encode_video_in_chunks(video_path):
@@ -303,71 +449,70 @@ def generate_thumbnails(video_path, num_chunks):
303
 
304
  return thumbnails
305
 
306
- def analyze_video_activities(video_path):
307
  """Analyze video using mPLUG model with chunking"""
308
  global TOTAL_CHUNKS
309
- try:
310
- # Existing chunk processing
311
- all_activities = []
312
- # Calculate total chunks first
313
- vr = VideoReader(video_path, ctx=cpu(0))
314
- sample_fps = round(vr.get_avg_fps() / 1)
315
- frame_idx = [i for i in range(0, len(vr), sample_fps)]
316
- TOTAL_CHUNKS = len([frame_idx[i:i + MAX_NUM_FRAMES]
317
- for i in range(0, len(frame_idx), MAX_NUM_FRAMES)])
 
 
 
 
 
 
 
 
 
 
 
318
 
319
- # Generate thumbnails with known chunk count
320
- thumbnails = generate_thumbnails(video_path, num_chunks=TOTAL_CHUNKS)
 
 
321
 
322
- # Now process chunks
323
- chunk_generator = encode_video_in_chunks(video_path)
324
- model, tokenizer, processor = load_model_and_tokenizer()
325
-
326
- for chunk_idx, video_frames, chunk_info in chunk_generator:
327
- prompt = "Analyze this construction site video chunk and describe the activities happening. Focus on construction activities, machinery usage, and worker actions. Include any construction equipment or machinery you can identify."
328
- response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
329
- print(f"Chunk {chunk_idx}: {response}")
330
 
331
- # Map responses to thumbnails
332
- time_start = chunk_idx * MAX_NUM_FRAMES
333
- chunk_thumbnails = [t for t in thumbnails
334
- if time_start <= t['time'] < time_start + MAX_NUM_FRAMES]
 
 
 
335
 
336
- # Extract time from frame position
337
- for thumbnail in chunk_thumbnails:
338
- # Calculate timestamp in minutes:seconds format
339
- seconds = int(thumbnail['time'])
340
- minutes = seconds // 60
341
- seconds = seconds % 60
342
- timestamp = f"{minutes:02d}:{seconds:02d}"
343
-
344
- # Extract objects using basic text parsing from the response
345
- # In a production system, you might want to use more sophisticated NLP
346
- objects = []
347
- lower_response = response.lower()
348
- possible_objects = ["excavator", "bulldozer", "crane", "truck", "loader",
349
- "worker", "concrete", "scaffold", "beam", "pipe",
350
- "rebar", "formwork", "drill", "grader", "roller"]
351
-
352
- for obj in possible_objects:
353
- if obj in lower_response:
354
- objects.append(obj)
355
-
356
- activity = {
357
- 'time': timestamp,
358
- 'timestamp_seconds': thumbnail['time'], # Store raw seconds for sorting
359
- 'summary': response,
360
- 'objects': objects,
361
- 'thumbnail': thumbnail["path"],
362
- 'chunk_id': chunk_idx,
363
- 'chunk_path': chunk_info['path'] if chunk_info else None
364
- }
365
-
366
- all_activities.append(activity)
367
-
368
- # Sort activities by timestamp
369
- all_activities.sort(key=lambda x: x['timestamp_seconds'])
370
- return all_activities
371
- except Exception as e:
372
- print(f"Error analyzing video: {str(e)}")
373
- return [] # Maintain consistent return type
 
2
  from transformers import AutoModel, AutoTokenizer
3
  from modelscope.hub.snapshot_download import snapshot_download
4
  from PIL import Image
5
+ from functools import lru_cache
6
  from decord import VideoReader, cpu
7
  import os
8
  import gc
 
10
  import tempfile
11
  import shutil
12
  import subprocess
13
+ import ffmpeg # Added for ffmpeg-python
14
  from yolo_detection import is_image, is_video
15
 
16
  # Constants for video processing
17
+ MAX_NUM_FRAMES = 32 # Reduced from 64 to potentially avoid OOM
18
 
19
  # Check if CUDA is available
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
29
 
30
  # Model configuration
31
  MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
32
+ MODEL_CACHE_DIR = "data/models"
33
  os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
34
 
35
  # Download and cache the model
 
41
 
42
 
43
  # Model configuration and existing functions remain unchanged...
44
+ @lru_cache(maxsize=1)
45
  def load_model_and_tokenizer():
46
+ """Load a cached instance of the model and tokenizer"""
47
+ print("Loading/Retrieving mPLUG model from cache...")
48
  try:
49
  # Clear GPU memory if using CUDA
50
  if DEVICE == "cuda":
 
70
  print(f"Error loading model: {str(e)}")
71
  raise
72
 
73
+
74
  def process_image(image_path, model, tokenizer, processor, prompt):
75
  """Process single image with mPLUG model"""
76
  try:
 
158
  })
159
 
160
  response = model.generate(**inputs)
161
+ del inputs
162
  return response[0]
163
 
def split_original_video(video_path, chunk_info):
    """Cut ``video_path`` into one MP4 file per entry of ``chunk_info``.

    The ``./tmp`` directory is wiped and recreated first, so anything that
    was stored there before the call is deleted. For every chunk three
    strategies are tried in order until one yields a usable file:

    1. the ``ffmpeg-python`` bindings (stream copy),
    2. OpenCV frame-by-frame re-encoding (``mp4v``),
    3. the ``ffmpeg`` executable via ``subprocess`` (stream copy, then a
       default re-encode).

    Args:
        video_path: Path of the source video.
        chunk_info: Iterable of mappings providing ``'chunk_id'``,
            ``'start_time'`` and ``'end_time'``. The times are passed to
            ffmpeg's ``-ss``/``-to`` and multiplied by the FPS for OpenCV,
            so they are presumably seconds -- confirm against the caller.

    Returns:
        List of the chunk file paths that were created, in input order.
        Chunks for which every strategy failed are skipped with a warning.
    """
    tmp_dir = os.path.join('.', 'tmp')
    _reset_tmp_dir(tmp_dir)

    strategies = (
        _split_with_ffmpeg_python,
        _split_with_opencv,
        _split_with_subprocess,
    )

    original_chunks = []
    for chunk in chunk_info:
        chunk_id = chunk['chunk_id']
        start_time = chunk['start_time']
        end_time = chunk['end_time']
        output_path = os.path.join(tmp_dir, f"original_chunk_{chunk_id}.mp4")

        # any() short-circuits, so later strategies only run when the
        # earlier ones reported failure.
        chunk_created = any(
            strategy(video_path, start_time, end_time, output_path, chunk_id)
            for strategy in strategies
        )

        if chunk_created and os.path.exists(output_path):
            original_chunks.append(output_path)
        else:
            print(f"Warning: Failed to create chunk {chunk_id} using all methods, skipping.")

    return original_chunks


def _reset_tmp_dir(tmp_dir):
    """Remove ``tmp_dir`` (best effort) and guarantee it exists afterwards."""
    if os.path.exists(tmp_dir):
        try:
            shutil.rmtree(tmp_dir)
            print(f"Cleaned up temporary directory: {tmp_dir}")
        except OSError as e:
            print(f"Error removing temporary directory {tmp_dir}: {e}")
    # Always (re)create: a partially failed rmtree must not leave us
    # without an output directory.
    os.makedirs(tmp_dir, exist_ok=True)


def _is_valid_file(path):
    """Return True when ``path`` exists and is not empty."""
    return os.path.exists(path) and os.path.getsize(path) > 0


def _remove_quietly(path):
    """Delete ``path`` if present; cleanup failures are deliberately ignored."""
    if os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup only


def _split_with_ffmpeg_python(video_path, start_time, end_time, output_path, chunk_id):
    """Strategy 1: stream-copy the time range with the ffmpeg-python bindings."""
    try:
        (
            ffmpeg
            .input(video_path, ss=start_time, to=end_time)
            .output(output_path, c='copy', loglevel="quiet")
            # overwrite_output avoids an interactive "overwrite?" prompt if a
            # stale file survived the tmp-dir cleanup.
            .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
        )
    except ffmpeg.Error as e:
        print(f"ffmpeg-python error for chunk {chunk_id}: {e.stderr.decode() if e.stderr else str(e)}, trying OpenCV method")
        return False
    except Exception as e:  # e.g. ffmpeg executable missing
        print(f"ffmpeg-python failed with general error for chunk {chunk_id}: {str(e)}, trying OpenCV method")
        return False

    if _is_valid_file(output_path):
        print(f"Successfully created chunk {chunk_id} using ffmpeg-python")
        return True

    print(f"ffmpeg-python ran but did not create a valid file for chunk {chunk_id}")
    return False


def _split_with_opencv(video_path, start_time, end_time, output_path, chunk_id):
    """Strategy 2: re-encode the frame range with OpenCV (video only, mp4v)."""
    cap = None
    writer = None
    frames_written = 0
    completed = False
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise IOError(f"Cannot open video file: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:  # container did not report a usable frame rate
            print(f"Warning: Invalid FPS ({fps}) detected for {video_path}. Using default 30.")
            fps = 30.0

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        start_frame = int(start_time * fps)
        end_frame = int(end_time * fps)
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        for current_frame in range(start_frame, end_frame):
            ret, frame = cap.read()
            if not ret:
                print(f"Warning: Could not read frame {current_frame} for chunk {chunk_id}. Reached end of video early?")
                break
            writer.write(frame)
            frames_written += 1
        completed = True
    except Exception as e:
        print(f"OpenCV method failed for chunk {chunk_id}: {str(e)}, trying subprocess method")
    finally:
        # Release on every path; the writer must be released before the
        # output file is inspected.
        if cap is not None:
            cap.release()
        if writer is not None:
            writer.release()

    if not completed:
        _remove_quietly(output_path)
        return False

    # A container holding zero frames has a non-zero size but is useless.
    if frames_written > 0 and _is_valid_file(output_path):
        print(f"Successfully created chunk {chunk_id} using OpenCV")
        return True

    print(f"OpenCV method ran but did not create a valid file for chunk {chunk_id}")
    _remove_quietly(output_path)
    return False


def _split_with_subprocess(video_path, start_time, end_time, output_path, chunk_id):
    """Strategy 3: call the ffmpeg executable; stream copy first, then re-encode."""
    # -y: overwrite leftovers from earlier strategies instead of prompting.
    # -nostdin: never block waiting for interactive input.
    head = [
        'ffmpeg', '-y', '-nostdin',
        '-ss', str(start_time),
        '-to', str(end_time),
        '-i', video_path,
    ]
    tail = ['-loglevel', 'error', output_path]

    try:
        process = subprocess.run(
            head + ['-c', 'copy'] + tail,
            capture_output=True, text=True, check=False,
        )
        if process.returncode != 0 or not _is_valid_file(output_path):
            print(f"Subprocess ffmpeg copy failed for chunk {chunk_id}. Stderr: {process.stderr}. Trying re-encoding.")
            # No codec options: let ffmpeg pick its defaults for the container.
            process_reencode = subprocess.run(
                head + tail,
                capture_output=True, text=True, check=False,
            )
            if process_reencode.returncode != 0:
                raise RuntimeError(f"Subprocess ffmpeg re-encode also failed. Stderr: {process_reencode.stderr}")

        if not _is_valid_file(output_path):
            raise RuntimeError("Subprocess ffmpeg failed to create a valid file.")
    except FileNotFoundError:
        print(f"Subprocess failed for chunk {chunk_id}: 'ffmpeg' command not found. Ensure ffmpeg is installed and in PATH.")
        return False
    except Exception as e:
        print(f"Subprocess method failed for chunk {chunk_id}: {str(e)}")
        _remove_quietly(output_path)
        return False

    print(f"Successfully created chunk {chunk_id} using subprocess ffmpeg")
    return True
324
 
325
  def encode_video_in_chunks(video_path):
 
449
 
450
  return thumbnails
451
 
452
def analyze_video_activities(video_path, model, tokenizer, processor):
    """Analyze video using mPLUG model with chunking.

    The video is sampled every ``round(avg_fps)`` frames and the samples are
    grouped into chunks of ``MAX_NUM_FRAMES``. Each chunk is described by the
    model, and that description is attached to every thumbnail whose
    ``'time'`` lies in
    ``[chunk_idx * MAX_NUM_FRAMES, (chunk_idx + 1) * MAX_NUM_FRAMES)``.
    NOTE(review): this mapping presumes thumbnail times are in seconds and
    that one frame is sampled per second -- confirm against
    ``generate_thumbnails`` / ``encode_video_in_chunks``.

    Side effect: sets the module-level ``TOTAL_CHUNKS``.

    Args:
        video_path: Path of the video to analyze.
        model, tokenizer, processor: Passed through unchanged to
            ``process_video_chunk``.

    Returns:
        List of activity dicts sorted by ``'timestamp_seconds'``, each with
        the keys ``'time'`` (``MM:SS``), ``'timestamp_seconds'``,
        ``'summary'``, ``'objects'``, ``'thumbnail'``, ``'chunk_id'`` and
        ``'chunk_path'``.

    Errors raised by the helpers propagate to the caller.
    """
    global TOTAL_CHUNKS

    prompt = "Analyze this construction site video chunk and describe the activities happening. Focus on construction activities, machinery usage, and worker actions. Include any construction equipment or machinery you can identify."
    # Keyword vocabulary used for a simple substring scan of each response.
    possible_objects = ("excavator", "bulldozer", "crane", "truck", "loader",
                        "worker", "concrete", "scaffold", "beam", "pipe",
                        "rebar", "formwork", "drill", "grader", "roller")

    # Work out the chunk count up front so thumbnails can be generated.
    vr = VideoReader(video_path, ctx=cpu(0))
    # Guard against a rounded FPS of 0, which would make range() raise.
    sample_step = max(1, round(vr.get_avg_fps()))
    sampled_frame_count = len(range(0, len(vr), sample_step))
    # Ceiling division: number of MAX_NUM_FRAMES-sized groups.
    TOTAL_CHUNKS = (sampled_frame_count + MAX_NUM_FRAMES - 1) // MAX_NUM_FRAMES

    thumbnails = generate_thumbnails(video_path, num_chunks=TOTAL_CHUNKS)

    all_activities = []
    for chunk_idx, video_frames, chunk_info in encode_video_in_chunks(video_path):
        response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
        print(f"Chunk {chunk_idx}: {response}")

        # The object scan depends only on the response, so do it once per
        # chunk rather than once per thumbnail.
        lower_response = response.lower()
        objects = [obj for obj in possible_objects if obj in lower_response]

        time_start = chunk_idx * MAX_NUM_FRAMES
        time_end = time_start + MAX_NUM_FRAMES

        for thumbnail in thumbnails:
            if not (time_start <= thumbnail['time'] < time_end):
                continue

            minutes, seconds = divmod(int(thumbnail['time']), 60)
            all_activities.append({
                'time': f"{minutes:02d}:{seconds:02d}",
                'timestamp_seconds': thumbnail['time'],  # raw value, used for sorting
                'summary': response,
                'objects': list(objects),  # independent copy per activity
                'thumbnail': thumbnail["path"],
                'chunk_id': chunk_idx,
                'chunk_path': chunk_info['path'] if chunk_info else None,
            })

    all_activities.sort(key=lambda activity: activity['timestamp_seconds'])
    return all_activities