Spaces:

BahadirGLCK
/

video_splitter

Running

App Files Files Community

BahadirGLCK commited on Feb 22

Commit

226eb77

1 Parent(s): 753ac1a

Appropriate for youtube original link format.

Browse files

Files changed (2) hide show

app.py +24 -21
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -9,14 +9,16 @@ import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import gradio as gr
 # ----------------------------------------
-# 1. Initialize the Qwen 2.5 VL Model (7B)
 # ----------------------------------------
-# We load the official 7B version, using flash attention optimization and bfloat16 for efficiency.
 model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    model_path
 )
 processor = AutoProcessor.from_pretrained(model_path)
@@ -26,7 +28,7 @@ processor = AutoProcessor.from_pretrained(model_path)
 def download_video(url, dest_path):
     """
-    Download the video from the given URL and save it to a destination path.
     """
     response = requests.get(url, stream=True)
     with open(dest_path, 'wb') as f:
@@ -37,16 +39,25 @@ def download_video(url, dest_path):
 def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     """
     Download (if needed) and extract frames and timestamps from the video.
-    - Uses caching to avoid repeated processing.
-    - Utilizes decord to read video frames.
     """
     os.makedirs(cache_dir, exist_ok=True)
     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
-    # If the video is a URL, download it locally
-    if video_path.startswith('http://') or video_path.startswith('https://'):
         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
         if not os.path.exists(video_file_path):
             download_video(video_path, video_file_path)
     else:
         video_file_path = video_path
@@ -87,9 +98,7 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
             {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
         ]},
     ]
-    # Prepare the text with the chat template from the processor.
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # Process the video information into the proper inputs.
     image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
     fps_inputs = video_kwargs['fps']
     inputs = processor(
@@ -100,11 +109,8 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
         padding=True,
         return_tensors="pt"
     )
-    #inputs = inputs.to('cuda')
-    # Generate the response using the model
     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    # Post-process the output tokens to text.
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
     return output_text[0]
@@ -123,11 +129,9 @@ sample_prompts = [
 # -------------------------------------------------
 def process_video(video_url, custom_prompt, sample_prompt):
     """
-    This function is called when a user clicks the 'Process Video' button.
-    - It uses the custom prompt if provided; otherwise, it falls back to the selected sample prompt.
-    - It then downloads and processes the video and calls the inference function.
     """
-    # Choose the prompt: use custom prompt if not empty, else use the sample
     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
     try:
         video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
@@ -145,7 +149,7 @@ def process_video(video_url, custom_prompt, sample_prompt):
 # 6. Build the Gradio Interface
 # -------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B)")
     gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
     with gr.Row():
@@ -158,7 +162,6 @@ with gr.Blocks() as demo:
     output_text = gr.Textbox(label="Output", lines=10)
     run_button = gr.Button("Process Video")
-    # When the button is clicked, run the process_video function.
     run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
 # -------------------------------------------------

 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import gradio as gr
+from pytube import YouTube  # new import for handling YouTube links
 # ----------------------------------------
+# 1. Initialize the Qwen 2.5 VL Model (7B) for CPU-only
 # ----------------------------------------
 model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.float16  # use float32 for CPU
+    # Removed attn_implementation and device_map for CPU-only deployment
 )
 processor = AutoProcessor.from_pretrained(model_path)
 def download_video(url, dest_path):
     """
+    Download a non-YouTube video using requests.
     """
     response = requests.get(url, stream=True)
     with open(dest_path, 'wb') as f:
 def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     """
     Download (if needed) and extract frames and timestamps from the video.
+    If the URL is a YouTube link, use pytube to download the video.
+    Uses caching to avoid repeated processing.
     """
     os.makedirs(cache_dir, exist_ok=True)
     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+    # Check if the video is a YouTube link
+    if video_path.startswith('http') and ("youtube.com" in video_path or "youtu.be" in video_path):
         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
         if not os.path.exists(video_file_path):
+            print("Downloading YouTube video using pytube...")
+            yt = YouTube(video_path)
+            stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
+            stream.download(output_path=cache_dir, filename=f'{video_hash}.mp4')
+    # Otherwise, if it's a direct link to an mp4 or a local path
+    elif video_path.startswith('http'):
+        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
+        if not os.path.exists(video_file_path):
+            print("Downloading video using requests...")
             download_video(video_path, video_file_path)
     else:
         video_file_path = video_path
             {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
         ]},
     ]
     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
     fps_inputs = video_kwargs['fps']
     inputs = processor(
         padding=True,
         return_tensors="pt"
     )
+    # Do not move to GPU on CPU-only environment
     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
     return output_text[0]
 # -------------------------------------------------
 def process_video(video_url, custom_prompt, sample_prompt):
     """
+    Called when the user clicks 'Process Video'. Uses the custom prompt if provided; otherwise uses the sample prompt.
+    Downloads and processes the video then runs the inference.
     """
     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
     try:
         video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
 # 6. Build the Gradio Interface
 # -------------------------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B) on CPU")
     gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
     with gr.Row():
     output_text = gr.Textbox(label="Output", lines=10)
     run_button = gr.Button("Process Video")
     run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
 # -------------------------------------------------

requirements.txt CHANGED Viewed

@@ -16,4 +16,5 @@ av
 decord
 numpy
 Pillow
-requests

 decord
 numpy
 Pillow
+requests
+pytube