BahadirGLCK committed
Commit 0ce60d0 · 1 Parent(s): 4667ca1

Upload video session

Files changed (1): app.py (+25 -29)
app.py CHANGED
@@ -9,15 +9,15 @@ import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import gradio as gr
-from pytube import YouTube  # new import for handling YouTube links
+# Removed pytube since we no longer download from YouTube
 
 # ----------------------------------------
-# 1. Initialize the Qwen 2.5 VL Model (7B) for CPU-only
+# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
 # ----------------------------------------
 model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     model_path,
-    torch_dtype=torch.float16  # use float32 for CPU
+    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
     # Removed attn_implementation and device_map for CPU-only deployment
 )
 processor = AutoProcessor.from_pretrained(model_path)
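
Note on the `torch_dtype` line: the retained float16 setting is risky on CPU, where half-precision kernels are often unsupported or slow; float32 is the safer CPU default. A minimal sketch of device-aware dtype selection (illustrative, not part of the commit):

```python
import torch
from transformers import Qwen2_5_VLForConditionalGeneration

# float16 only pays off on a GPU; fall back to float32 for CPU-only inference.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=dtype,
)
```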
@@ -25,10 +25,10 @@ processor = AutoProcessor.from_pretrained(model_path)
 # -------------------------------------------------
 # 2. Define Utility Functions for Video Processing
 # -------------------------------------------------
-
 def download_video(url, dest_path):
     """
     Download a non-YouTube video using requests.
+    (This function is retained if you need it later.)
     """
     response = requests.get(url, stream=True)
     with open(dest_path, 'wb') as f:
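
The hunk ends mid-function; `download_video` presumably streams the response body in chunks after this line. A self-contained sketch of that pattern, where the timeout, `raise_for_status()`, and chunk size are assumptions rather than part of the diff:

```python
import requests

def download_video(url, dest_path):
    """Stream a remote video to disk without holding it all in memory."""
    response = requests.get(url, stream=True, timeout=60)  # timeout is an assumption
    response.raise_for_status()  # fail fast on HTTP errors (not in the original)
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1 MiB chunks
            f.write(chunk)
```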
@@ -38,31 +38,25 @@ def download_video(url, dest_path):
 
 def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     """
-    Download (if needed) and extract frames and timestamps from the video.
-    If the URL is a YouTube link, use pytube to download the video.
+    Extract frames and timestamps from a video file.
+    If the video_path is a URL, it will download it.
+    For local files (including uploaded videos), it processes directly.
     Uses caching to avoid repeated processing.
     """
     os.makedirs(cache_dir, exist_ok=True)
     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
 
-    # Check if the video is a YouTube link
-    if video_path.startswith('http') and ("youtube.com" in video_path or "youtu.be" in video_path):
-        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
-        if not os.path.exists(video_file_path):
-            print("Downloading YouTube video using pytube...")
-            yt = YouTube(video_path)
-            stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
-            stream.download(output_path=cache_dir, filename=f'{video_hash}.mp4')
-    # Otherwise, if it's a direct link to an mp4 or a local path
-    elif video_path.startswith('http'):
+    # If video_path starts with 'http', attempt to download
+    if video_path.startswith('http'):
         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
         if not os.path.exists(video_file_path):
             print("Downloading video using requests...")
             download_video(video_path, video_file_path)
     else:
+        # For local files (uploaded videos), use the provided path directly.
         video_file_path = video_path
 
-    # Check if frames have been cached already
+    # Check for cached frames
     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
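
One caveat with this change: the cache key is still `md5(video_path)`, and Gradio stores each upload under a fresh temporary path, so re-uploading the same file never hits the cache. A hypothetical content-based key for local files (not in the commit):

```python
import hashlib

def video_cache_key(video_path):
    """Hash URLs by string, local files by content, so re-uploads reuse the cache."""
    if video_path.startswith('http'):
        return hashlib.md5(video_path.encode('utf-8')).hexdigest()
    h = hashlib.md5()
    with open(video_path, 'rb') as f:
        for block in iter(lambda: f.read(1024 * 1024), b''):
            h.update(block)
    return h.hexdigest()
```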
@@ -77,7 +71,7 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
     frames = vr.get_batch(indices).asnumpy()
     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
 
-    # Save the results to cache for later re-use
+    # Save to cache
     np.save(frames_cache_file, frames)
     np.save(timestamps_cache_file, timestamps)
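
`vr` and `indices` come from context the diff does not show. Given the decord calls visible in this hunk, the sampling step presumably resembles the following sketch (uniform spacing and the file name are assumptions):

```python
import numpy as np
from decord import VideoReader, cpu

vr = VideoReader('example.mp4', ctx=cpu(0))            # illustrative path
indices = np.linspace(0, len(vr) - 1, 64).astype(int)  # 64 evenly spaced frames
frames = vr.get_batch(indices).asnumpy()               # (64, H, W, 3) uint8 array
# get_frame_timestamp returns a (start, end) pair in seconds per frame
timestamps = np.array([vr.get_frame_timestamp(int(i)) for i in indices])
```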
 
@@ -88,8 +82,8 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
 # --------------------------------------------------------
 def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
     """
-    Prepare the input messages with the prompt and video metadata,
-    process the video inputs, and run inference through the model.
+    Prepares the input messages with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
     """
     messages = [
         {"role": "system", "content": "You are a helpful assistant."},
@@ -109,7 +103,7 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
         padding=True,
         return_tensors="pt"
     )
-    # Do not move to GPU on CPU-only environment
+    # In CPU-only mode, we use the default device (no .to('cuda'))
     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
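
The list comprehension above strips each prompt from its generation before decoding. Because padding gives every row of `inputs.input_ids` the same length, an equivalent batched form of the same trim is:

```python
# output_ids is (batch, prompt_len + new_tokens); dropping the first
# prompt_len columns leaves only the newly generated tokens to decode.
prompt_len = inputs.input_ids.shape[1]
generated_ids = output_ids[:, prompt_len:]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
```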
@@ -127,14 +121,16 @@ sample_prompts = [
 # -------------------------------------------------
 # 5. Main Processing Function for the Gradio Interface
 # -------------------------------------------------
-def process_video(video_url, custom_prompt, sample_prompt):
+def process_video(video_file, custom_prompt, sample_prompt):
     """
-    Called when the user clicks 'Process Video'. Uses the custom prompt if provided; otherwise uses the sample prompt.
-    Downloads and processes the video then runs the inference.
+    Called when the user clicks 'Process Video'.
+    Uses the custom prompt if provided; otherwise, uses the sample prompt.
+    Processes the uploaded video file and runs inference.
     """
     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
     try:
-        video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
+        # video_file is expected to be a local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
     except Exception as e:
         return f"Error processing video: {str(e)}"
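
Depending on the Gradio version, `gr.Video` may hand the callback a plain filepath string or a richer payload. A defensive normalization helper (hypothetical, not in the commit) would keep `get_video_frames` working either way:

```python
def to_local_path(video_file):
    """Normalize a Gradio video payload to a filesystem path (payload shapes are assumptions)."""
    if isinstance(video_file, dict):  # some versions pass e.g. {'name': '/tmp/...mp4'}
        return video_file.get('name') or video_file.get('path')
    return video_file  # already a plain path string
```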
 
@@ -149,11 +145,11 @@ def process_video(video_url, custom_prompt, sample_prompt):
 # 6. Build the Gradio Interface
 # -------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B) on CPU")
-    gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
 
     with gr.Row():
-        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL...", lines=1)
+        video_input = gr.Video(label="Upload Video", source="upload")
     with gr.Row():
         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
     with gr.Row():
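
`gr.Video(source="upload")` matches the Gradio 3.x API; in Gradio 4+, the argument was renamed to a `sources` list, so a later dependency bump would need (hedged sketch):

```python
import gradio as gr

# Gradio 4.x spelling: `sources` takes a list instead of a `source` string.
video_input = gr.Video(label="Upload Video", sources=["upload"])
```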
@@ -162,7 +158,7 @@ with gr.Blocks() as demo:
     output_text = gr.Textbox(label="Output", lines=10)
     run_button = gr.Button("Process Video")
 
-    run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
 
 # -------------------------------------------------
 # 7. Launch the App
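
Section 7 falls outside the diff context; a Gradio Space conventionally ends with a bare launch call, roughly (any flags are assumptions left at defaults):

```python
if __name__ == "__main__":
    demo.launch()
```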
 