Spaces:
Running
Running
Commit
·
0ce60d0
1
Parent(s):
4667ca1
Upload video session
Browse files
app.py
CHANGED
@@ -9,15 +9,15 @@ import torch
|
|
9 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
10 |
from qwen_vl_utils import process_vision_info
|
11 |
import gradio as gr
|
12 |
-
|
13 |
|
14 |
# ----------------------------------------
|
15 |
-
# 1. Initialize the Qwen 2.5 VL Model (
|
16 |
# ----------------------------------------
|
17 |
model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
|
18 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
19 |
model_path,
|
20 |
-
torch_dtype=torch.float16 # use
|
21 |
# Removed attn_implementation and device_map for CPU-only deployment
|
22 |
)
|
23 |
processor = AutoProcessor.from_pretrained(model_path)
|
@@ -25,10 +25,10 @@ processor = AutoProcessor.from_pretrained(model_path)
|
|
25 |
# -------------------------------------------------
|
26 |
# 2. Define Utility Functions for Video Processing
|
27 |
# -------------------------------------------------
|
28 |
-
|
29 |
def download_video(url, dest_path):
|
30 |
"""
|
31 |
Download a non-YouTube video using requests.
|
|
|
32 |
"""
|
33 |
response = requests.get(url, stream=True)
|
34 |
with open(dest_path, 'wb') as f:
|
@@ -38,31 +38,25 @@ def download_video(url, dest_path):
|
|
38 |
|
39 |
def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
40 |
"""
|
41 |
-
|
42 |
-
If the
|
|
|
43 |
Uses caching to avoid repeated processing.
|
44 |
"""
|
45 |
os.makedirs(cache_dir, exist_ok=True)
|
46 |
video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
|
47 |
|
48 |
-
#
|
49 |
-
if video_path.startswith('http')
|
50 |
-
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
51 |
-
if not os.path.exists(video_file_path):
|
52 |
-
print("Downloading YouTube video using pytube...")
|
53 |
-
yt = YouTube(video_path)
|
54 |
-
stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
|
55 |
-
stream.download(output_path=cache_dir, filename=f'{video_hash}.mp4')
|
56 |
-
# Otherwise, if it's a direct link to an mp4 or a local path
|
57 |
-
elif video_path.startswith('http'):
|
58 |
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
59 |
if not os.path.exists(video_file_path):
|
60 |
print("Downloading video using requests...")
|
61 |
download_video(video_path, video_file_path)
|
62 |
else:
|
|
|
63 |
video_file_path = video_path
|
64 |
|
65 |
-
# Check
|
66 |
frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
|
67 |
timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
|
68 |
if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
|
@@ -77,7 +71,7 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
|
77 |
frames = vr.get_batch(indices).asnumpy()
|
78 |
timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
|
79 |
|
80 |
-
# Save
|
81 |
np.save(frames_cache_file, frames)
|
82 |
np.save(timestamps_cache_file, timestamps)
|
83 |
|
@@ -88,8 +82,8 @@ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
|
88 |
# --------------------------------------------------------
|
89 |
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
|
90 |
"""
|
91 |
-
|
92 |
-
|
93 |
"""
|
94 |
messages = [
|
95 |
{"role": "system", "content": "You are a helpful assistant."},
|
@@ -109,7 +103,7 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
|
|
109 |
padding=True,
|
110 |
return_tensors="pt"
|
111 |
)
|
112 |
-
#
|
113 |
output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
|
114 |
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
115 |
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
@@ -127,14 +121,16 @@ sample_prompts = [
|
|
127 |
# -------------------------------------------------
|
128 |
# 5. Main Processing Function for the Gradio Interface
|
129 |
# -------------------------------------------------
|
130 |
-
def process_video(
|
131 |
"""
|
132 |
-
Called when the user clicks 'Process Video'.
|
133 |
-
|
|
|
134 |
"""
|
135 |
final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
|
136 |
try:
|
137 |
-
|
|
|
138 |
except Exception as e:
|
139 |
return f"Error processing video: {str(e)}"
|
140 |
|
@@ -149,11 +145,11 @@ def process_video(video_url, custom_prompt, sample_prompt):
|
|
149 |
# 6. Build the Gradio Interface
|
150 |
# -------------------------------------------------
|
151 |
with gr.Blocks() as demo:
|
152 |
-
gr.Markdown("#
|
153 |
-
gr.Markdown("
|
154 |
|
155 |
with gr.Row():
|
156 |
-
|
157 |
with gr.Row():
|
158 |
custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
|
159 |
with gr.Row():
|
@@ -162,7 +158,7 @@ with gr.Blocks() as demo:
|
|
162 |
output_text = gr.Textbox(label="Output", lines=10)
|
163 |
run_button = gr.Button("Process Video")
|
164 |
|
165 |
-
run_button.click(fn=process_video, inputs=[
|
166 |
|
167 |
# -------------------------------------------------
|
168 |
# 7. Launch the App
|
|
|
9 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
10 |
from qwen_vl_utils import process_vision_info
|
11 |
import gradio as gr
|
12 |
+
# Removed pytube since we no longer download from YouTube
|
13 |
|
14 |
# ----------------------------------------
|
15 |
+
# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
|
16 |
# ----------------------------------------
|
17 |
model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
|
18 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
19 |
model_path,
|
20 |
+
torch_dtype=torch.float16 # use float16 on CPU if desired, else use float32
|
21 |
# Removed attn_implementation and device_map for CPU-only deployment
|
22 |
)
|
23 |
processor = AutoProcessor.from_pretrained(model_path)
|
|
|
25 |
# -------------------------------------------------
|
26 |
# 2. Define Utility Functions for Video Processing
|
27 |
# -------------------------------------------------
|
|
|
28 |
def download_video(url, dest_path):
|
29 |
"""
|
30 |
Download a non-YouTube video using requests.
|
31 |
+
(Still called by get_video_frames when video_path starts with 'http'.)
|
32 |
"""
|
33 |
response = requests.get(url, stream=True)
|
34 |
with open(dest_path, 'wb') as f:
|
|
|
38 |
|
39 |
def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
40 |
"""
|
41 |
+
Extract frames and timestamps from a video file.
|
42 |
+
If video_path starts with 'http', the video is first downloaded into cache_dir.
|
43 |
+
For local files (including uploaded videos), it processes directly.
|
44 |
Uses caching to avoid repeated processing.
|
45 |
"""
|
46 |
os.makedirs(cache_dir, exist_ok=True)
|
47 |
video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
|
48 |
|
49 |
+
# If video_path starts with 'http', attempt to download
|
50 |
+
if video_path.startswith('http'):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
52 |
if not os.path.exists(video_file_path):
|
53 |
print("Downloading video using requests...")
|
54 |
download_video(video_path, video_file_path)
|
55 |
else:
|
56 |
+
# For local files (uploaded videos), use the provided path directly.
|
57 |
video_file_path = video_path
|
58 |
|
59 |
+
# Check for cached frames
|
60 |
frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
|
61 |
timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
|
62 |
if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
|
|
|
71 |
frames = vr.get_batch(indices).asnumpy()
|
72 |
timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
|
73 |
|
74 |
+
# Save to cache
|
75 |
np.save(frames_cache_file, frames)
|
76 |
np.save(timestamps_cache_file, timestamps)
|
77 |
|
|
|
82 |
# --------------------------------------------------------
|
83 |
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
|
84 |
"""
|
85 |
+
Prepares the input messages with the prompt and video metadata,
|
86 |
+
processes the video inputs, and runs inference through the model.
|
87 |
"""
|
88 |
messages = [
|
89 |
{"role": "system", "content": "You are a helpful assistant."},
|
|
|
103 |
padding=True,
|
104 |
return_tensors="pt"
|
105 |
)
|
106 |
+
# In CPU-only mode, we use the default device (no .to('cuda'))
|
107 |
output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
|
108 |
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
109 |
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
|
|
121 |
# -------------------------------------------------
|
122 |
# 5. Main Processing Function for the Gradio Interface
|
123 |
# -------------------------------------------------
|
124 |
+
def process_video(video_file, custom_prompt, sample_prompt):
|
125 |
"""
|
126 |
+
Called when the user clicks 'Process Video'.
|
127 |
+
Uses the custom prompt if provided; otherwise, uses the sample prompt.
|
128 |
+
Processes the uploaded video file and runs inference.
|
129 |
"""
|
130 |
final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
|
131 |
try:
|
132 |
+
# video_file is expected to be a local file path from the uploader.
|
133 |
+
video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
|
134 |
except Exception as e:
|
135 |
return f"Error processing video: {str(e)}"
|
136 |
|
|
|
145 |
# 6. Build the Gradio Interface
|
146 |
# -------------------------------------------------
|
147 |
with gr.Blocks() as demo:
|
148 |
+
gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
|
149 |
+
gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
|
150 |
|
151 |
with gr.Row():
|
152 |
+
video_input = gr.Video(label="Upload Video", source="upload")
|
153 |
with gr.Row():
|
154 |
custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
|
155 |
with gr.Row():
|
|
|
158 |
output_text = gr.Textbox(label="Output", lines=10)
|
159 |
run_button = gr.Button("Process Video")
|
160 |
|
161 |
+
run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
|
162 |
|
163 |
# -------------------------------------------------
|
164 |
# 7. Launch the App
|