assentian1970 committed on
Commit b08dda1 · verified · 1 Parent(s): efd5a3f

Create image_captioning.py

Files changed (1)
  1. image_captioning.py +380 -0
image_captioning.py ADDED
@@ -0,0 +1,380 @@
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ from modelscope.hub.snapshot_download import snapshot_download
+ from PIL import Image
+ from decord import VideoReader, cpu
+ import os
+ import gc
+ import cv2
+ import shutil
+ import subprocess
+ from yolo_detection import is_image, is_video
+
+ # Constants for video processing
+ MAX_NUM_FRAMES = 32
+
+ # Working directory for chunk videos and thumbnails
+ TMP_DIR = os.path.join('/teamspace/studios/this_studio', 'tmp')
+
+ # Check if CUDA is available
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Total number of video chunks; updated once the video has been split
+ TOTAL_CHUNKS = 1
+
+ # Warm up the CUDA context if a GPU is available
+ if DEVICE == "cuda":
+     torch.randn(10, device="cuda")
+
+ # Model configuration
+ MODEL_NAME = 'iic/mPLUG-Owl3-7B-240728'
+ MODEL_CACHE_DIR = os.getenv('TRANSFORMERS_CACHE', './models')
+
+ # Create cache directory if it doesn't exist
+ os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
+
+ # Download and cache the model
+ try:
+     model_path = snapshot_download(MODEL_NAME, cache_dir=MODEL_CACHE_DIR)
+ except Exception as e:
+     print(f"Error downloading model: {str(e)}")
+     model_path = os.path.join(MODEL_CACHE_DIR, MODEL_NAME)
+
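+ # Example: set TRANSFORMERS_CACHE before launching to control where the
+ # checkpoint is stored (otherwise it falls back to ./models):
+ #   export TRANSFORMERS_CACHE=/path/to/model/cache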
+
+ def load_model_and_tokenizer():
+     """Load a fresh instance of the model and tokenizer"""
+     try:
+         # Clear GPU memory if using CUDA
+         if DEVICE == "cuda":
+             torch.cuda.empty_cache()
+             gc.collect()
+
+         model = AutoModel.from_pretrained(
+             model_path,
+             attn_implementation='sdpa',
+             trust_remote_code=True,
+             torch_dtype=torch.half,
+             device_map='auto'
+         )
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_path,
+             trust_remote_code=True
+         )
+         model.eval()
+         processor = model.init_processor(tokenizer)
+         return model, tokenizer, processor
+     except Exception as e:
+         print(f"Error loading model: {str(e)}")
+         raise
+
+ def process_image(image_path, model, tokenizer, processor, prompt):
+     """Process a single image with the mPLUG model"""
+     try:
+         image = Image.open(image_path)
+         messages = [{
+             "role": "user",
+             "content": prompt,
+             "images": [image]
+         }]
+
+         model_messages = []
+         images = []
+
+         for msg in messages:
+             content_str = msg["content"]
+             if "images" in msg and msg["images"]:
+                 content_str += "<|image|>"
+                 images.extend(msg["images"])
+             model_messages.append({
+                 "role": msg["role"],
+                 "content": content_str
+             })
+
+         model_messages.append({
+             "role": "assistant",
+             "content": ""
+         })
+
+         inputs = processor(
+             model_messages,
+             images=images,
+             videos=None
+         )
+         inputs.to(DEVICE)
+         inputs.update({
+             'tokenizer': tokenizer,
+             'max_new_tokens': 100,
+             'decode_text': True,
+         })
+
+         response = model.generate(**inputs)
+         return response[0]
+     except Exception as e:
+         print(f"Error processing image: {str(e)}")
+         return "Error processing image"
+
+ def process_video_chunk(video_frames, model, tokenizer, processor, prompt):
+     """Process a chunk of video frames with the mPLUG model"""
+     messages = [
+         {
+             "role": "user",
+             "content": prompt,
+             "video_frames": video_frames
+         }
+     ]
+
+     model_messages = []
+     videos = []
+
+     for msg in messages:
+         content_str = msg["content"]
+         if "video_frames" in msg and msg["video_frames"]:
+             content_str += "<|video|>"
+             videos.append(msg["video_frames"])
+         model_messages.append({
+             "role": msg["role"],
+             "content": content_str
+         })
+
+     model_messages.append({
+         "role": "assistant",
+         "content": ""
+     })
+
+     inputs = processor(
+         model_messages,
+         images=None,
+         videos=videos if videos else None
+     )
+     inputs.to(DEVICE)
+     inputs.update({
+         'tokenizer': tokenizer,
+         'max_new_tokens': 100,
+         'decode_text': True,
+     })
+
+     response = model.generate(**inputs)
+     return response[0]
+
+ def split_original_video(video_path, chunk_info):
+     """Split the original video into chunks using the chunk timestamps"""
+     original_chunks = []
+
+     for chunk in chunk_info:
+         output_path = os.path.join(TMP_DIR, f"original_chunk_{chunk['chunk_id']}.mp4")
+         duration = chunk['end_time'] - chunk['start_time']
+         # Fast split without re-encoding; note that with stream copy ffmpeg
+         # cuts at the nearest keyframes, so chunk boundaries are approximate
+         cmd = [
+             'ffmpeg', '-y',
+             '-ss', str(chunk['start_time']),
+             '-i', video_path,
+             '-t', str(duration),
+             '-c', 'copy',
+             output_path
+         ]
+         subprocess.run(cmd, check=True)
+         original_chunks.append(output_path)
+
+     return original_chunks
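+
+ # Illustrative expansion of the command above (hypothetical values): for a
+ # chunk covering 0.0-32.0 s of input.mp4, it runs roughly
+ #   ffmpeg -y -ss 0.0 -i input.mp4 -t 32.0 -c copy tmp/original_chunk_0.mp4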
+
+ def encode_video_in_chunks(video_path):
+     """Extract frames from a video in chunks and save the chunks to disk"""
+     global TOTAL_CHUNKS
+     vr = VideoReader(video_path, ctx=cpu(0))
+     fps = vr.get_avg_fps()
+     # Sample roughly one frame per second of video
+     frame_stride = max(1, round(fps))
+     frame_idx = list(range(0, len(vr), frame_stride))
+
+     # Create tmp directory if it doesn't exist
+     os.makedirs(TMP_DIR, exist_ok=True)
+
+     # Split frame indices into chunks of at most MAX_NUM_FRAMES frames
+     chunks = [
+         frame_idx[i:i + MAX_NUM_FRAMES]
+         for i in range(0, len(frame_idx), MAX_NUM_FRAMES)
+     ]
+
+     # Set global TOTAL_CHUNKS before processing
+     TOTAL_CHUNKS = len(chunks)
+     print(f"Total chunks: {TOTAL_CHUNKS}")
+
+     # Information about saved chunks
+     chunk_info = []
+
+     for chunk_idx, chunk in enumerate(chunks):
+         # Get frames for this chunk
+         frames = vr.get_batch(chunk).asnumpy()
+         frames_pil = [Image.fromarray(v.astype('uint8')) for v in frames]
+
+         # Save chunk as a video file
+         chunk_path = os.path.join(TMP_DIR, f"chunk_{chunk_idx}.mp4")
+
+         # Calculate start and end times for this chunk
+         if chunk:
+             start_frame = chunk[0]
+             end_frame = chunk[-1]
+             start_time = start_frame / fps
+             end_time = end_frame / fps
+
+             # Save chunk info for later use
+             chunk_info.append({
+                 'chunk_id': chunk_idx,
+                 'path': chunk_path,
+                 'start_time': start_time,
+                 'end_time': end_time,
+                 'start_frame': start_frame,
+                 'end_frame': end_frame,
+                 'original_fps': fps  # Actual fps from the source video
+             })
+
+         # Use OpenCV to write the sampled frames as a video; the 1 FPS samples
+         # are written at the original fps, so chunk previews play as a timelapse
+         height, width, _ = frames[0].shape
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(chunk_path, fourcc, fps, (width, height))
+
+         for frame in frames:
+             # Convert RGB to BGR (OpenCV format)
+             frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+             out.write(frame_bgr)
+
+         out.release()
+         print(f"Saved chunk {chunk_idx} to {chunk_path}")
+
+         yield chunk_idx, frames_pil, (chunk_info[-1] if chunk_info else None)
+
+     # Split the original video once all chunks have been yielded
+     split_original_video(video_path, chunk_info)
+
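+ # Worked example (illustrative): a 95 s video at ~30 fps yields 95 sampled
+ # frames at 1 FPS, which split into ceil(95 / 32) = 3 chunks of 32, 32 and 31
+ # frames, each covering roughly 32 s of footage.
+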
+ def analyze_image_activities(image_path):
+     """Analyze a construction site image and generate activity descriptions"""
+     from datetime import datetime, timedelta
+     try:
+         # Sample structured response - replace with actual model processing
+         return [
+             {
+                 'time': datetime.now().strftime("%I:%M %p"),
+                 'summary': 'Excavation work in progress',
+                 'objects': ['excavator', 'worker', 'dump-truck']
+             },
+             {
+                 'time': (datetime.now() - timedelta(minutes=30)).strftime("%I:%M %p"),
+                 'summary': 'Material loading operation',
+                 'objects': ['loader', 'worker', 'gravel']
+             }
+         ]
+     except Exception as e:
+         print(f"Error analyzing image: {str(e)}")
+         return []  # Return empty list on error
+
+
+ def generate_thumbnails(video_path, num_chunks):
+     """Extract one thumbnail per chunk
+     Args:
+         video_path: Path to video file
+         num_chunks: Number of chunks to generate thumbnails for
+     """
+     vr = VideoReader(video_path, ctx=cpu(0))
+     thumbnails = []
+     total_frames = len(vr)
+
+     # Recreate the tmp directory from scratch
+     if os.path.exists(TMP_DIR):
+         shutil.rmtree(TMP_DIR)
+     os.makedirs(TMP_DIR, exist_ok=True)
+
+     # Frame step size based on the number of chunks (at least 1)
+     frame_step = max(1, total_frames // num_chunks)
+
+     for chunk_idx in range(num_chunks):
+         # Take the frame at the start of each chunk
+         frame_idx = chunk_idx * frame_step
+         if frame_idx < total_frames:
+             frame = vr[frame_idx].asnumpy()
+             img = Image.fromarray(frame)
+             temp_path = os.path.join(TMP_DIR, f"thumbnail_{chunk_idx}.jpg")
+             img.save(temp_path)
+             thumbnails.append({
+                 "path": temp_path,
+                 "time": frame_idx / vr.get_avg_fps()
+             })
+
+     return thumbnails
+
+ def analyze_video_activities(video_path):
+     """Analyze a video with the mPLUG model, processing it in chunks"""
+     global TOTAL_CHUNKS
+     try:
+         all_activities = []
+
+         # Calculate the total number of chunks first
+         vr = VideoReader(video_path, ctx=cpu(0))
+         frame_stride = max(1, round(vr.get_avg_fps()))  # ~1 frame per second
+         frame_idx = list(range(0, len(vr), frame_stride))
+         TOTAL_CHUNKS = (len(frame_idx) + MAX_NUM_FRAMES - 1) // MAX_NUM_FRAMES
+
+         # Generate thumbnails with the known chunk count
+         thumbnails = generate_thumbnails(video_path, num_chunks=TOTAL_CHUNKS)
+
+         # Now process the chunks
+         chunk_generator = encode_video_in_chunks(video_path)
+
+         for chunk_idx, video_frames, chunk_info in chunk_generator:
+             # A fresh model instance per chunk keeps GPU memory bounded
+             model, tokenizer, processor = load_model_and_tokenizer()
+             prompt = ("Analyze this construction site video chunk and describe "
+                       "the activities happening. Focus on construction activities, "
+                       "machinery usage, and worker actions. Include any construction "
+                       "equipment or machinery you can identify.")
+             response = process_video_chunk(video_frames, model, tokenizer, processor, prompt)
+             print(f"Chunk {chunk_idx}: {response}")
+
+             # Map responses to thumbnails; at ~1 FPS sampling each chunk spans
+             # about MAX_NUM_FRAMES seconds, so frame offsets double as seconds
+             time_start = chunk_idx * MAX_NUM_FRAMES
+             chunk_thumbnails = [t for t in thumbnails
+                                 if time_start <= t['time'] < time_start + MAX_NUM_FRAMES]
+
+             for thumbnail in chunk_thumbnails:
+                 # Format the timestamp as minutes:seconds
+                 seconds = int(thumbnail['time'])
+                 minutes = seconds // 60
+                 seconds = seconds % 60
+                 timestamp = f"{minutes:02d}:{seconds:02d}"
+
+                 # Extract objects with basic keyword matching on the response;
+                 # a production system might use more sophisticated NLP
+                 objects = []
+                 lower_response = response.lower()
+                 possible_objects = ["excavator", "bulldozer", "crane", "truck", "loader",
+                                     "worker", "concrete", "scaffold", "beam", "pipe",
+                                     "rebar", "formwork", "drill", "grader", "roller"]
+
+                 for obj in possible_objects:
+                     if obj in lower_response:
+                         objects.append(obj)
+
+                 activity = {
+                     'time': timestamp,
+                     'timestamp_seconds': thumbnail['time'],  # Raw seconds for sorting
+                     'summary': response,
+                     'objects': objects,
+                     'thumbnail': thumbnail["path"],
+                     'chunk_id': chunk_idx,
+                     'chunk_path': chunk_info['path'] if chunk_info else None
+                 }
+
+                 all_activities.append(activity)
+
+             # Release the model between chunks
+             del model, tokenizer, processor
+             if DEVICE == "cuda":
+                 torch.cuda.empty_cache()
+             gc.collect()
+
+         # Sort activities by timestamp
+         all_activities.sort(key=lambda x: x['timestamp_seconds'])
+         return all_activities
+     except Exception as e:
+         print(f"Error analyzing video: {str(e)}")
+         return []  # Maintain a consistent return type
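+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative, not part of the pipeline); assumes a
+     # local file named sample.mp4 or sample.jpg exists (hypothetical path), and
+     # that yolo_detection.is_video/is_image accept a file path
+     sample = "sample.mp4"
+     if is_video(sample):
+         for activity in analyze_video_activities(sample):
+             print(activity['time'], activity['summary'])
+     elif is_image(sample):
+         model, tokenizer, processor = load_model_and_tokenizer()
+         print(process_image(sample, model, tokenizer, processor,
+                             "Describe the construction activities in this image."))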