SmolVLM2-on-transformers

Running

App Files Files Community

SkyNait committed on Jul 9

Commit

9c538e6

verified ·

1 Parent(s): f7ea386

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -72

app.py CHANGED Viewed

@@ -67,6 +67,7 @@ def extract_frames_from_video(video_path, max_frames=10):
     frames = []
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     if frame_count == 0:
         cap.release()
@@ -82,14 +83,16 @@ def extract_frames_from_video(video_path, max_frames=10):
             break
         if frame_idx % step == 0:
-            frames.append(frame)
             if len(frames) >= max_frames:
                 break
         frame_idx += 1
     cap.release()
-    return frames
 @spaces.GPU
 def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
@@ -168,96 +171,131 @@ def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
     except Exception as e:
         return f"Error: {str(e)}", '\n'.join(debug_msgs)
-@spaces.GPU
-def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
-    """Process uploaded video file and return captions for multiple frames"""
-    if video_file is None:
-        return "No video file uploaded", ""
     debug_msgs = []
-    temp_files = []  # Track temporary files for cleanup
     try:
         update_model(model_id, device)
         processor = model_cache['processor']
         model = model_cache['model']
         # Extract frames from video
         t0 = time.time()
-        frames = extract_frames_from_video(video_file, max_frames)
-        debug_msgs.append(f'Extracted {len(frames)} frames in {int((time.time()-t0)*1000)} ms')
-        if not frames:
             return "No frames could be extracted from the video", '\n'.join(debug_msgs)
-        captions = []
-        for i, frame in enumerate(frames):
-            # Preprocess frame
-            t1 = time.time()
-            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_img = Image.fromarray(rgb)
-            temp_path = f'frame_{i}.jpg'
-            temp_files.append(temp_path)  # Track for cleanup
-            pil_img.save(temp_path, format='JPEG', quality=50)
-            # Prepare multimodal chat messages
-            messages = [
-                {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
-                {'role': 'user', 'content': [
-                    {'type': 'image', 'url': temp_path},
-                    {'type': 'text', 'text': usr_prompt}
-                ]}
-            ]
-            # Tokenize and encode
-            inputs = processor.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
-                return_tensors='pt'
             )
-            # Move inputs to correct device and dtype
-            param_dtype = next(model.parameters()).dtype
-            cast_inputs = {}
-            for k, v in inputs.items():
-                if isinstance(v, torch.Tensor):
-                    if v.dtype.is_floating_point:
-                        cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
-                    else:
-                        cast_inputs[k] = v.to(device=model.device)
-                else:
-                    cast_inputs[k] = v
-            inputs = cast_inputs
-            # Inference
-            outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
-            # Decode and strip history
-            raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-            if "Assistant:" in raw:
-                caption = raw.split("Assistant:")[-1].strip()
             else:
-                lines = raw.splitlines()
-                caption = lines[-1].strip() if len(lines) > 1 else raw.strip()
-            captions.append(f"Frame {i+1}: {caption}")
-            debug_msgs.append(f'Frame {i+1} processed in {int((time.time()-t1)*1000)} ms')
-        return '\n\n'.join(captions), '\n'.join(debug_msgs)
     except Exception as e:
         return f"Error processing video: {str(e)}", '\n'.join(debug_msgs)
-    finally:
-        # Clean up all temporary files
-        for temp_file in temp_files:
-            if os.path.exists(temp_file):
-                try:
-                    os.remove(temp_file)
-                except Exception as cleanup_error:
-                    logging.warning(f"Failed to cleanup {temp_file}: {cleanup_error}")
 def toggle_input_mode(input_mode):
     """Toggle between webcam and video file input"""
@@ -303,6 +341,7 @@ def main():
         # Video file-specific controls
         with gr.Row(visible=False) as video_controls:
             max_frames = gr.Slider(1, 20, step=1, value=5, label='Max Frames to Process')
         sys_p = gr.Textbox(lines=2, value='Describe the key action', label='System Prompt')
@@ -347,8 +386,8 @@ def main():
         # Video file processing
         process_btn.click(
-            fn=process_video_file,
-            inputs=[video_file, model_dd, sys_p, usr_p, device_dd, max_frames],
             outputs=[caption_tb, log_tb]
         )

     frames = []
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
     if frame_count == 0:
         cap.release()
             break
         if frame_idx % step == 0:
+            # Calculate timestamp for this frame
+            timestamp = frame_idx / fps if fps > 0 else frame_idx
+            frames.append((frame, timestamp))
             if len(frames) >= max_frames:
                 break
         frame_idx += 1
     cap.release()
+    return frames, fps
 @spaces.GPU
 def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
     except Exception as e:
         return f"Error: {str(e)}", '\n'.join(debug_msgs)
+def process_single_frame(frame, model_id, sys_prompt, usr_prompt, device, frame_id=0):
+    """Caption one video frame with the cached model.
+
+    The frame is converted from BGR to RGB, saved as a temporary JPEG,
+    handed to the processor's chat template together with both prompts,
+    and the generated text is trimmed down to the final reply.
+
+    Args:
+        frame: Image array accepted by ``cv2.cvtColor`` with
+            ``cv2.COLOR_BGR2RGB``.
+        model_id: Model identifier forwarded to ``update_model``.
+        sys_prompt: Text of the system message.
+        usr_prompt: Text sent next to the image in the user message.
+        device: Device selector forwarded to ``update_model``.
+        frame_id: Index embedded in the temporary file name.
+
+    Returns:
+        A ``(caption, debug_msgs, error)`` tuple. ``debug_msgs`` is a list
+        of per-stage timing strings. On success ``error`` is ``None``; on
+        failure ``caption`` is ``"Error: <message>"`` and ``error`` is that
+        message (the exception class name when the message is empty).
+    """
+    # Local import so this block does not depend on the file's import header.
+    import tempfile
+    debug_msgs = []
+    temp_path = None
+    try:
+        # Ensure model is loaded
+        update_model(model_id, device)
+        processor = model_cache['processor']
+        model = model_cache['model']
+        # Preprocess frame
+        t0 = time.time()
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(rgb)
+        # Use a unique path per call. A fixed name in the working directory
+        # is shared by overlapping requests, so one request could overwrite
+        # or delete the image while another is still reading it.
+        fd, temp_path = tempfile.mkstemp(prefix=f'video_frame_{frame_id}_', suffix='.jpg')
+        os.close(fd)  # PIL reopens the file by path below
+        pil_img.save(temp_path, format='JPEG', quality=50)
+        debug_msgs.append(f'Preprocess: {int((time.time()-t0)*1000)} ms')
+        # Prepare multimodal chat messages
+        messages = [
+            {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
+            {'role': 'user', 'content': [
+                {'type': 'image', 'url': temp_path},
+                {'type': 'text', 'text': usr_prompt}
+            ]}
+        ]
+        # Tokenize and encode
+        t1 = time.time()
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors='pt'
+        )
+        # Move every tensor to the model's device; only floating-point
+        # tensors are also cast to the dtype of the model parameters.
+        param_dtype = next(model.parameters()).dtype
+        cast_inputs = {}
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor):
+                if v.dtype.is_floating_point:
+                    cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
+                else:
+                    cast_inputs[k] = v.to(device=model.device)
+            else:
+                cast_inputs[k] = v
+        inputs = cast_inputs
+        debug_msgs.append(f'Tokenize: {int((time.time()-t1)*1000)} ms')
+        # Inference
+        t2 = time.time()
+        outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
+        debug_msgs.append(f'Inference: {int((time.time()-t2)*1000)} ms')
+        # Decode and strip history
+        t3 = time.time()
+        raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        debug_msgs.append(f'Decode: {int((time.time()-t3)*1000)} ms')
+        # Keep only the text after the last "Assistant:" marker; without a
+        # marker, fall back to the last line of a multi-line output.
+        if "Assistant:" in raw:
+            caption = raw.split("Assistant:")[-1].strip()
+        else:
+            lines = raw.splitlines()
+            caption = lines[-1].strip() if len(lines) > 1 else raw.strip()
+        return caption, debug_msgs, None
+    except Exception as e:
+        # Never return an empty error string: callers that test the third
+        # element for truthiness would mistake the failure for success.
+        message = str(e) or type(e).__name__
+        return f"Error: {message}", debug_msgs, message
+    finally:
+        # Clean up temp file; a failed removal is logged, not raised.
+        if temp_path and os.path.exists(temp_path):
+            try:
+                os.remove(temp_path)
+            except OSError as cleanup_error:
+                logging.warning("Failed to cleanup %s: %s", temp_path, cleanup_error)
+@spaces.GPU
+def process_video_with_interval(video_file, model_id, sys_prompt, usr_prompt, device, max_frames, interval_ms):
+    """Caption frames sampled from a video file, pausing between frames.
+
+    Frames come from ``extract_frames_from_video``; each one is captioned by
+    ``process_single_frame``. Before every frame except the first the call
+    sleeps for ``interval_ms`` milliseconds.
+
+    Returns:
+        A pair of strings: the per-frame captions joined by blank lines,
+        and the debug log joined by newlines. When no file is given, no
+        frame is extracted, or an exception is raised, the first string is
+        a message describing that outcome instead.
+    """
+    if video_file is None:
+        return "No video file uploaded", ""
+    log_lines = []
+    caption_lines = []
+    try:
+        # Pull the sampled frames (with their timestamps) out of the video
+        started = time.time()
+        sampled, fps = extract_frames_from_video(video_file, max_frames)
+        log_lines.append(f'Extracted {len(sampled)} frames in {int((time.time()-started)*1000)} ms')
+        log_lines.append(f'Video FPS: {fps:.2f}')
+        if not sampled:
+            return "No frames could be extracted from the video", '\n'.join(log_lines)
+        for index, (frame, timestamp) in enumerate(sampled):
+            frame_no = index + 1
+            # The first frame is handled immediately; later ones wait first
+            if index > 0:
+                time.sleep(interval_ms / 1000)
+            caption, frame_log, error = process_single_frame(
+                frame, model_id, sys_prompt, usr_prompt, device, frame_id=index
+            )
+            # Label each entry with its position and timestamp
+            label = f"Frame {frame_no} (t={timestamp:.2f}s)"
+            outcome = f"ERROR - {error}" if error else caption
+            caption_lines.append(f"{label}: {outcome}")
+            # Prefix the per-frame debug messages with the frame number
+            prefixed = [f"Frame {frame_no}: {msg}" for msg in frame_log]
+            log_lines.extend(prefixed)
+        return '\n\n'.join(caption_lines), '\n'.join(log_lines)
+    except Exception as e:
+        return f"Error processing video: {str(e)}", '\n'.join(log_lines)
 def toggle_input_mode(input_mode):
     """Toggle between webcam and video file input"""
         # Video file-specific controls
         with gr.Row(visible=False) as video_controls:
+            interval_video = gr.Slider(100, 10000, step=100, value=1000, label='Processing Interval (ms)')
             max_frames = gr.Slider(1, 20, step=1, value=5, label='Max Frames to Process')
         sys_p = gr.Textbox(lines=2, value='Describe the key action', label='System Prompt')
         # Video file processing
         process_btn.click(
+            fn=process_video_with_interval,
+            inputs=[video_file, model_dd, sys_p, usr_p, device_dd, max_frames, interval_video],
             outputs=[caption_tb, log_tb]
         )