Spaces:

BahadirGLCK
/

video_splitter

Running

App Files Community

BahadirGLCK committed on Feb 22

Commit

0ce60d0

1 Parent(s): 4667ca1

Upload video session

Browse files

Files changed (1) hide show

app.py +25 -29

app.py CHANGED Viewed

@@ -9,15 +9,15 @@ import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import gradio as gr
-from pytube import YouTube  # new import for handling YouTube links
 # ----------------------------------------
-# 1. Initialize the Qwen 2.5 VL Model (7B) for CPU-only
 # ----------------------------------------
 model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype=torch.float16  # use float32 for CPU
     # Removed attn_implementation and device_map for CPU-only deployment
 )
 processor = AutoProcessor.from_pretrained(model_path)
@@ -25,10 +25,10 @@ processor = AutoProcessor.from_pretrained(model_path)
 # -------------------------------------------------
 # 2. Define Utility Functions for Video Processing
 # -------------------------------------------------
 def download_video(url, dest_path):
     """
     Download a non-YouTube video using requests.
     """
     response = requests.get(url, stream=True)
     with open(dest_path, 'wb') as f:
@@ -38,31 +38,25 @@ def download_video(url, dest_path):
 def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     """
-    Download (if needed) and extract frames and timestamps from the video.
-    If the URL is a YouTube link, use pytube to download the video.
     Uses caching to avoid repeated processing.
     """
     os.makedirs(cache_dir, exist_ok=True)
     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
-    # Check if the video is a YouTube link
-    if video_path.startswith('http') and ("youtube.com" in video_path or "youtu.be" in video_path):
-        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
-        if not os.path.exists(video_file_path):
-            print("Downloading YouTube video using pytube...")
-            yt = YouTube(video_path)
-            stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
-            stream.download(output_path=cache_dir, filename=f'{video_hash}.mp4')
-    # Otherwise, if it's a direct link to an mp4 or a local path
-    elif video_path.startswith('http'):
         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
         if not os.path.exists(video_file_path):
             print("Downloading video using requests...")
             download_video(video_path, video_file_path)
     else:
         video_file_path = video_path
-    # Check if frames have been cached already
     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
@@ -77,7 +71,7 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     frames = vr.get_batch(indices).asnumpy()
     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
-    # Save the results to cache for later re-use
     np.save(frames_cache_file, frames)
     np.save(timestamps_cache_file, timestamps)
@@ -88,8 +82,8 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
 # --------------------------------------------------------
 def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
     """
-    Prepare the input messages with the prompt and video metadata,
-    process the video inputs, and run inference through the model.
     """
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
@@ -109,7 +103,7 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
         padding=True,
         return_tensors="pt"
     )
-    # Do not move to GPU on CPU-only environment
     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
@@ -127,14 +121,16 @@ sample_prompts = [
 # -------------------------------------------------
 # 5. Main Processing Function for the Gradio Interface
 # -------------------------------------------------
-def process_video(video_url, custom_prompt, sample_prompt):
     """
-    Called when the user clicks 'Process Video'. Uses the custom prompt if provided; otherwise uses the sample prompt.
-    Downloads and processes the video then runs the inference.
     """
     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
     try:
-        video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
     except Exception as e:
         return f"Error processing video: {str(e)}"
@@ -149,11 +145,11 @@ def process_video(video_url, custom_prompt, sample_prompt):
 # 6. Build the Gradio Interface
 # -------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B) on CPU")
-    gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
     with gr.Row():
-        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL...", lines=1)
     with gr.Row():
         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
     with gr.Row():
@@ -162,7 +158,7 @@ with gr.Blocks() as demo:
     output_text = gr.Textbox(label="Output", lines=10)
     run_button = gr.Button("Process Video")
-    run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
 # -------------------------------------------------
 # 7. Launch the App

 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import gradio as gr
+# Removed pytube since we no longer download from YouTube
 # ----------------------------------------
+# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
 # ----------------------------------------
 model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     model_path,
+    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
     # Removed attn_implementation and device_map for CPU-only deployment
 )
 processor = AutoProcessor.from_pretrained(model_path)
 # -------------------------------------------------
 # 2. Define Utility Functions for Video Processing
 # -------------------------------------------------
 def download_video(url, dest_path):
     """
     Download a non-YouTube video using requests.
+    (This function is retained if you need it later.)
     """
     response = requests.get(url, stream=True)
     with open(dest_path, 'wb') as f:
 def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     """
+    Extract frames and timestamps from a video file.
+    If the video_path is a URL, it will download it.
+    For local files (including uploaded videos), it processes directly.
     Uses caching to avoid repeated processing.
     """
     os.makedirs(cache_dir, exist_ok=True)
     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+    # If video_path starts with 'http', attempt to download
+    if video_path.startswith('http'):
         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
         if not os.path.exists(video_file_path):
             print("Downloading video using requests...")
             download_video(video_path, video_file_path)
     else:
+        # For local files (uploaded videos), use the provided path directly.
         video_file_path = video_path
+    # Check for cached frames
     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
     frames = vr.get_batch(indices).asnumpy()
     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+    # Save to cache
     np.save(frames_cache_file, frames)
     np.save(timestamps_cache_file, timestamps)
 # --------------------------------------------------------
 def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
     """
+    Prepares the input messages with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
     """
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
         padding=True,
         return_tensors="pt"
     )
+    # In CPU-only mode, we use the default device (no .to('cuda'))
     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
 # -------------------------------------------------
 # 5. Main Processing Function for the Gradio Interface
 # -------------------------------------------------
+def process_video(video_file, custom_prompt, sample_prompt):
     """
+    Called when the user clicks 'Process Video'.
+    Uses the custom prompt if provided; otherwise, uses the sample prompt.
+    Processes the uploaded video file and runs inference.
     """
     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
     try:
+        # video_file is expected to be a local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
     except Exception as e:
         return f"Error processing video: {str(e)}"
 # 6. Build the Gradio Interface
 # -------------------------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
     with gr.Row():
+        video_input = gr.Video(label="Upload Video", source="upload")
     with gr.Row():
         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
     with gr.Row():
     output_text = gr.Textbox(label="Output", lines=10)
     run_button = gr.Button("Process Video")
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
 # -------------------------------------------------
 # 7. Launch the App