SkyNait committed (verified)
Commit 9c538e6 · 1 Parent(s): f7ea386

Update app.py
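
In brief: extract_frames_from_video now also reads the source FPS and returns (frame, timestamp) pairs instead of bare frames; the monolithic process_video_file is replaced by a reusable process_single_frame helper plus a new @spaces.GPU entry point, process_video_with_interval, which paces frames by a configurable interval and tags each caption with its timestamp; and the video controls gain a 'Processing Interval (ms)' slider, with the click handler rewired to the new function.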

Files changed (1):
  1. app.py  +111 -72

app.py CHANGED
@@ -67,6 +67,7 @@ def extract_frames_from_video(video_path, max_frames=10):
 
     frames = []
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
 
     if frame_count == 0:
         cap.release()
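
Note: cv2.CAP_PROP_FPS comes from container metadata, and OpenCV returns 0.0 when the property is unavailable (e.g. for some streams), which is why the next hunk guards the timestamp computation with fps > 0.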
@@ -82,14 +83,16 @@ def extract_frames_from_video(video_path, max_frames=10):
             break
 
         if frame_idx % step == 0:
-            frames.append(frame)
+            # Calculate timestamp for this frame
+            timestamp = frame_idx / fps if fps > 0 else frame_idx
+            frames.append((frame, timestamp))
             if len(frames) >= max_frames:
                 break
 
         frame_idx += 1
 
     cap.release()
-    return frames
+    return frames, fps
 
 @spaces.GPU
 def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
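
For context, the sampling contract after this change: every step-th frame is kept (up to max_frames), each paired with a timestamp derived from its frame index, and callers now receive a (frames, fps) tuple. A minimal standalone sketch of that logic, assuming step = max(1, frame_count // max_frames) (the diff does not show how step is computed):

import cv2

def sample_frames(video_path, max_frames=10):
    # Sketch of the extraction contract after this commit, not the app's exact code.
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)           # 0.0 when the container lacks FPS metadata
    step = max(1, frame_count // max_frames)  # assumed; not shown in this diff
    frames, frame_idx = [], 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % step == 0:
            # Fall back to the raw frame index when FPS is unknown
            timestamp = frame_idx / fps if fps > 0 else frame_idx
            frames.append((frame, timestamp))
            if len(frames) >= max_frames:
                break
        frame_idx += 1
    cap.release()
    return frames, fps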
@@ -168,96 +171,131 @@ def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
     except Exception as e:
         return f"Error: {str(e)}", '\n'.join(debug_msgs)
 
-@spaces.GPU
-def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
-    """Process uploaded video file and return captions for multiple frames"""
-    if video_file is None:
-        return "No video file uploaded", ""
-
+def process_single_frame(frame, model_id, sys_prompt, usr_prompt, device, frame_id=0):
+    """Process a single frame similar to webcam mode - optimized for reuse"""
     debug_msgs = []
-    temp_files = [] # Track temporary files for cleanup
+    temp_path = None
 
     try:
+        # Ensure model is loaded
         update_model(model_id, device)
         processor = model_cache['processor']
         model = model_cache['model']
+
+        # Preprocess frame
+        t0 = time.time()
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(rgb)
+        temp_path = f'video_frame_{frame_id}.jpg'
+        pil_img.save(temp_path, format='JPEG', quality=50)
+        debug_msgs.append(f'Preprocess: {int((time.time()-t0)*1000)} ms')
+
+        # Prepare multimodal chat messages
+        messages = [
+            {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
+            {'role': 'user', 'content': [
+                {'type': 'image', 'url': temp_path},
+                {'type': 'text', 'text': usr_prompt}
+            ]}
+        ]
+
+        # Tokenize and encode
+        t1 = time.time()
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors='pt'
+        )
 
+        # Move inputs to correct device and dtype (matching model parameters)
+        param_dtype = next(model.parameters()).dtype
+        cast_inputs = {}
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor):
+                if v.dtype.is_floating_point:
+                    cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
+                else:
+                    cast_inputs[k] = v.to(device=model.device)
+            else:
+                cast_inputs[k] = v
+        inputs = cast_inputs
+        debug_msgs.append(f'Tokenize: {int((time.time()-t1)*1000)} ms')
+
+        # Inference
+        t2 = time.time()
+        outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
+        debug_msgs.append(f'Inference: {int((time.time()-t2)*1000)} ms')
+
+        # Decode and strip history
+        t3 = time.time()
+        raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        debug_msgs.append(f'Decode: {int((time.time()-t3)*1000)} ms')
+
+        if "Assistant:" in raw:
+            caption = raw.split("Assistant:")[-1].strip()
+        else:
+            lines = raw.splitlines()
+            caption = lines[-1].strip() if len(lines) > 1 else raw.strip()
+
+        return caption, debug_msgs, None
+
+    except Exception as e:
+        return f"Error: {str(e)}", debug_msgs, str(e)
+    finally:
+        # Clean up temp file
+        if temp_path and os.path.exists(temp_path):
+            try:
+                os.remove(temp_path)
+            except Exception as cleanup_error:
+                logging.warning(f"Failed to cleanup {temp_path}: {cleanup_error}")
+
+@spaces.GPU
+def process_video_with_interval(video_file, model_id, sys_prompt, usr_prompt, device, max_frames, interval_ms):
+    """Process video file with interval-based processing similar to webcam mode"""
+    if video_file is None:
+        return "No video file uploaded", ""
+
+    debug_msgs = []
+    all_captions = []
+
+    try:
         # Extract frames from video
         t0 = time.time()
-        frames = extract_frames_from_video(video_file, max_frames)
-        debug_msgs.append(f'Extracted {len(frames)} frames in {int((time.time()-t0)*1000)} ms')
+        frames_with_timestamps, fps = extract_frames_from_video(video_file, max_frames)
+        debug_msgs.append(f'Extracted {len(frames_with_timestamps)} frames in {int((time.time()-t0)*1000)} ms')
+        debug_msgs.append(f'Video FPS: {fps:.2f}')
 
-        if not frames:
+        if not frames_with_timestamps:
             return "No frames could be extracted from the video", '\n'.join(debug_msgs)
 
-        captions = []
-
-        for i, frame in enumerate(frames):
-            # Preprocess frame
-            t1 = time.time()
-            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_img = Image.fromarray(rgb)
-            temp_path = f'frame_{i}.jpg'
-            temp_files.append(temp_path) # Track for cleanup
-            pil_img.save(temp_path, format='JPEG', quality=50)
+        # Process each frame with interval delay (similar to webcam mode)
+        for i, (frame, timestamp) in enumerate(frames_with_timestamps):
+            # Apply interval delay (similar to webcam mode)
+            if i > 0: # Don't delay the first frame
+                time.sleep(interval_ms / 1000)
 
-            # Prepare multimodal chat messages
-            messages = [
-                {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
-                {'role': 'user', 'content': [
-                    {'type': 'image', 'url': temp_path},
-                    {'type': 'text', 'text': usr_prompt}
-                ]}
-            ]
-
-            # Tokenize and encode
-            inputs = processor.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors='pt'
+            # Process frame using the same logic as webcam mode
+            caption, frame_debug_msgs, error = process_single_frame(
+                frame, model_id, sys_prompt, usr_prompt, device, frame_id=i
             )
 
-            # Move inputs to correct device and dtype
-            param_dtype = next(model.parameters()).dtype
-            cast_inputs = {}
-            for k, v in inputs.items():
-                if isinstance(v, torch.Tensor):
-                    if v.dtype.is_floating_point:
-                        cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
-                    else:
-                        cast_inputs[k] = v.to(device=model.device)
-                else:
-                    cast_inputs[k] = v
-            inputs = cast_inputs
-
-            # Inference
-            outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
+            # Add timing information
+            timestamp_str = f"{timestamp:.2f}s"
 
-            # Decode and strip history
-            raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-            if "Assistant:" in raw:
-                caption = raw.split("Assistant:")[-1].strip()
+            if error:
+                all_captions.append(f"Frame {i+1} (t={timestamp_str}): ERROR - {error}")
             else:
-                lines = raw.splitlines()
-                caption = lines[-1].strip() if len(lines) > 1 else raw.strip()
+                all_captions.append(f"Frame {i+1} (t={timestamp_str}): {caption}")
 
-            captions.append(f"Frame {i+1}: {caption}")
-            debug_msgs.append(f'Frame {i+1} processed in {int((time.time()-t1)*1000)} ms')
+            # Add frame-specific debug info
+            debug_msgs.extend([f"Frame {i+1}: {msg}" for msg in frame_debug_msgs])
 
-        return '\n\n'.join(captions), '\n'.join(debug_msgs)
+        return '\n\n'.join(all_captions), '\n'.join(debug_msgs)
 
     except Exception as e:
         return f"Error processing video: {str(e)}", '\n'.join(debug_msgs)
-    finally:
-        # Clean up all temporary files
-        for temp_file in temp_files:
-            if os.path.exists(temp_file):
-                try:
-                    os.remove(temp_file)
-                except Exception as cleanup_error:
-                    logging.warning(f"Failed to cleanup {temp_file}: {cleanup_error}")
 
 def toggle_input_mode(input_mode):
     """Toggle between webcam and video file input"""
@@ -303,6 +341,7 @@ def main():
 
         # Video file-specific controls
         with gr.Row(visible=False) as video_controls:
+            interval_video = gr.Slider(100, 10000, step=100, value=1000, label='Processing Interval (ms)')
             max_frames = gr.Slider(1, 20, step=1, value=5, label='Max Frames to Process')
 
         sys_p = gr.Textbox(lines=2, value='Describe the key action', label='System Prompt')
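
The new slider deliberately mirrors webcam mode's pacing: process_video_with_interval sleeps interval_ms / 1000 seconds before every frame after the first, so at the default 1000 ms a 10-frame run adds roughly 9 seconds of wall-clock time on top of inference.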
@@ -347,8 +386,8 @@ def main():
 
         # Video file processing
         process_btn.click(
-            fn=process_video_file,
-            inputs=[video_file, model_dd, sys_p, usr_p, device_dd, max_frames],
+            fn=process_video_with_interval,
+            inputs=[video_file, model_dd, sys_p, usr_p, device_dd, max_frames, interval_video],
             outputs=[caption_tb, log_tb]
         )
 
 
393