Spaces:
Running
Running
Commit
·
226eb77
1
Parent(s):
753ac1a
Handle the original YouTube link format: download YouTube URLs with pytube.
Browse files
- app.py +24 -21
- requirements.txt +2 -1
app.py
CHANGED
@@ -9,14 +9,16 @@ import torch
|
|
9 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
10 |
from qwen_vl_utils import process_vision_info
|
11 |
import gradio as gr
|
|
|
12 |
|
13 |
# ----------------------------------------
|
14 |
-
# 1. Initialize the Qwen 2.5 VL Model (7B)
|
15 |
# ----------------------------------------
|
16 |
-
# We load the official 7B version, using flash attention optimization and bfloat16 for efficiency.
|
17 |
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
|
18 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
19 |
-
model_path
|
|
|
|
|
20 |
)
|
21 |
processor = AutoProcessor.from_pretrained(model_path)
|
22 |
|
@@ -26,7 +28,7 @@ processor = AutoProcessor.from_pretrained(model_path)
|
|
26 |
|
27 |
def download_video(url, dest_path):
|
28 |
"""
|
29 |
-
Download
|
30 |
"""
|
31 |
response = requests.get(url, stream=True)
|
32 |
with open(dest_path, 'wb') as f:
|
@@ -37,16 +39,25 @@ def download_video(url, dest_path):
|
|
37 |
def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
38 |
"""
|
39 |
Download (if needed) and extract frames and timestamps from the video.
|
40 |
-
|
41 |
-
|
42 |
"""
|
43 |
os.makedirs(cache_dir, exist_ok=True)
|
44 |
video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
|
45 |
-
|
46 |
-
#
|
47 |
-
if video_path.startswith('http
|
48 |
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
49 |
if not os.path.exists(video_file_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
download_video(video_path, video_file_path)
|
51 |
else:
|
52 |
video_file_path = video_path
|
@@ -87,9 +98,7 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
|
|
87 |
{"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
|
88 |
]},
|
89 |
]
|
90 |
-
# Prepare the text with the chat template from the processor.
|
91 |
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
92 |
-
# Process the video information into the proper inputs.
|
93 |
image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
|
94 |
fps_inputs = video_kwargs['fps']
|
95 |
inputs = processor(
|
@@ -100,11 +109,8 @@ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 *
|
|
100 |
padding=True,
|
101 |
return_tensors="pt"
|
102 |
)
|
103 |
-
#
|
104 |
-
|
105 |
-
# Generate the response using the model
|
106 |
output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
|
107 |
-
# Post-process the output tokens to text.
|
108 |
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
109 |
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
110 |
return output_text[0]
|
@@ -123,11 +129,9 @@ sample_prompts = [
|
|
123 |
# -------------------------------------------------
|
124 |
def process_video(video_url, custom_prompt, sample_prompt):
|
125 |
"""
|
126 |
-
|
127 |
-
|
128 |
-
- It then downloads and processes the video and calls the inference function.
|
129 |
"""
|
130 |
-
# Choose the prompt: use custom prompt if not empty, else use the sample
|
131 |
final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
|
132 |
try:
|
133 |
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
|
@@ -145,7 +149,7 @@ def process_video(video_url, custom_prompt, sample_prompt):
|
|
145 |
# 6. Build the Gradio Interface
|
146 |
# -------------------------------------------------
|
147 |
with gr.Blocks() as demo:
|
148 |
-
gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B)")
|
149 |
gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
|
150 |
|
151 |
with gr.Row():
|
@@ -158,7 +162,6 @@ with gr.Blocks() as demo:
|
|
158 |
output_text = gr.Textbox(label="Output", lines=10)
|
159 |
run_button = gr.Button("Process Video")
|
160 |
|
161 |
-
# When the button is clicked, run the process_video function.
|
162 |
run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
|
163 |
|
164 |
# -------------------------------------------------
|
|
|
9 |
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
10 |
from qwen_vl_utils import process_vision_info
|
11 |
import gradio as gr
|
12 |
+
from pytube import YouTube # new import for handling YouTube links
|
13 |
|
14 |
# ----------------------------------------
|
15 |
+
# 1. Initialize the Qwen 2.5 VL Model (7B) for CPU-only
|
16 |
# ----------------------------------------
|
|
|
17 |
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
|
18 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
19 |
+
model_path,
|
20 |
+
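torch_dtype=torch.float16 # NOTE(review): this requests float16, although the original comment said "use float32 for CPU" — confirm the intended dtype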
|
21 |
+
# Removed attn_implementation and device_map for CPU-only deployment
|
22 |
)
|
23 |
processor = AutoProcessor.from_pretrained(model_path)
|
24 |
|
|
|
28 |
|
29 |
def download_video(url, dest_path):
|
30 |
"""
|
31 |
+
Download a non-YouTube video using requests.
|
32 |
"""
|
33 |
response = requests.get(url, stream=True)
|
34 |
with open(dest_path, 'wb') as f:
|
|
|
39 |
def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
|
40 |
"""
|
41 |
Download (if needed) and extract frames and timestamps from the video.
|
42 |
+
If the URL is a YouTube link, use pytube to download the video.
|
43 |
+
Uses caching to avoid repeated processing.
|
44 |
"""
|
45 |
os.makedirs(cache_dir, exist_ok=True)
|
46 |
video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
|
47 |
+
|
48 |
+
# Check if the video is a YouTube link
|
49 |
+
if video_path.startswith('http') and ("youtube.com" in video_path or "youtu.be" in video_path):
|
50 |
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
51 |
if not os.path.exists(video_file_path):
|
52 |
+
print("Downloading YouTube video using pytube...")
|
53 |
+
yt = YouTube(video_path)
|
54 |
+
stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
|
55 |
+
stream.download(output_path=cache_dir, filename=f'{video_hash}.mp4')
|
56 |
+
# Otherwise, if it's a direct (non-YouTube) HTTP link, download it with requests; local paths are handled by the final else branch
|
57 |
+
elif video_path.startswith('http'):
|
58 |
+
video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
|
59 |
+
if not os.path.exists(video_file_path):
|
60 |
+
print("Downloading video using requests...")
|
61 |
download_video(video_path, video_file_path)
|
62 |
else:
|
63 |
video_file_path = video_path
|
|
|
98 |
{"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
|
99 |
]},
|
100 |
]
|
|
|
101 |
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
|
102 |
image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
|
103 |
fps_inputs = video_kwargs['fps']
|
104 |
inputs = processor(
|
|
|
109 |
padding=True,
|
110 |
return_tensors="pt"
|
111 |
)
|
112 |
+
# Do not move to GPU on CPU-only environment
|
|
|
|
|
113 |
output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
|
|
|
114 |
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
|
115 |
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
116 |
return output_text[0]
|
|
|
129 |
# -------------------------------------------------
|
130 |
def process_video(video_url, custom_prompt, sample_prompt):
|
131 |
"""
|
132 |
+
Called when the user clicks 'Process Video'. Uses the custom prompt if provided; otherwise uses the sample prompt.
|
133 |
+
Downloads and processes the video then runs the inference.
|
|
|
134 |
"""
|
|
|
135 |
final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
|
136 |
try:
|
137 |
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
|
|
|
149 |
# 6. Build the Gradio Interface
|
150 |
# -------------------------------------------------
|
151 |
with gr.Blocks() as demo:
|
152 |
+
gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B) on CPU")
|
153 |
gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
|
154 |
|
155 |
with gr.Row():
|
|
|
162 |
output_text = gr.Textbox(label="Output", lines=10)
|
163 |
run_button = gr.Button("Process Video")
|
164 |
|
|
|
165 |
run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
|
166 |
|
167 |
# -------------------------------------------------
|
requirements.txt
CHANGED
@@ -16,4 +16,5 @@ av
|
|
16 |
decord
|
17 |
numpy
|
18 |
Pillow
|
19 |
-
requests
|
|
|
|
16 |
decord
|
17 |
numpy
|
18 |
Pillow
|
19 |
+
requests
|
20 |
+
pytube
|