Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -56,31 +56,51 @@ class YouTubeDownloader:
|
|
56 |
try:
|
57 |
duration = video_info.get('duration', 0)
|
58 |
title = video_info.get('title', '')
|
59 |
-
description = video_info.get('description', '')[:
|
60 |
|
61 |
if not duration:
|
62 |
return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
|
63 |
|
64 |
-
# Create prompt for Gemini
|
65 |
prompt = f"""
|
66 |
-
Analyze this YouTube video and create a detailed scene-by-scene breakdown with timestamps:
|
67 |
|
68 |
Title: {title}
|
69 |
Duration: {duration} seconds
|
70 |
Description: {description}
|
71 |
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
"""
|
85 |
|
86 |
response = self.gemini_model.generate_content(prompt)
|
@@ -89,11 +109,28 @@ class YouTubeDownloader:
|
|
89 |
if response and response.text:
|
90 |
scenes = []
|
91 |
lines = response.text.split('\n')
|
|
|
|
|
92 |
for line in lines:
|
93 |
line = line.strip()
|
94 |
-
if line
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
|
|
|
|
|
|
|
|
97 |
return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
|
98 |
else:
|
99 |
return self.generate_scene_breakdown_fallback(video_info)
|
@@ -103,23 +140,30 @@ class YouTubeDownloader:
|
|
103 |
return self.generate_scene_breakdown_fallback(video_info)
|
104 |
|
105 |
def generate_scene_breakdown_fallback(self, video_info):
|
106 |
-
"""
|
107 |
duration = video_info.get('duration', 0)
|
108 |
title = video_info.get('title', '').lower()
|
|
|
|
|
109 |
|
110 |
if not duration:
|
111 |
return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
|
112 |
|
113 |
-
#
|
114 |
-
if duration <=
|
115 |
-
segment_length =
|
116 |
-
elif duration <=
|
117 |
-
segment_length =
|
|
|
|
|
118 |
else:
|
119 |
-
segment_length =
|
120 |
|
121 |
scenes = []
|
122 |
-
num_segments = min(duration // segment_length + 1,
|
|
|
|
|
|
|
123 |
|
124 |
for i in range(num_segments):
|
125 |
start_time = i * segment_length
|
@@ -128,17 +172,88 @@ class YouTubeDownloader:
|
|
128 |
start_formatted = f"{start_time//60}:{start_time%60:02d}"
|
129 |
end_formatted = f"{end_time//60}:{end_time%60:02d}"
|
130 |
|
131 |
-
|
132 |
-
|
133 |
-
elif i == num_segments - 1:
|
134 |
-
desc = "Conclusion with final thoughts and call-to-action"
|
135 |
-
else:
|
136 |
-
desc = f"Main content segment {i} with key information and details"
|
137 |
|
138 |
scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
|
139 |
|
140 |
return scenes
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
def detect_video_type(self, title, description):
|
143 |
"""Detect video type based on title and description"""
|
144 |
text = (title + " " + description).lower()
|
@@ -560,7 +675,6 @@ def create_interface():
|
|
560 |
inputs=[api_key_input],
|
561 |
outputs=[api_status, main_interface]
|
562 |
)
|
563 |
-
|
564 |
# Always show interface option (for fallback mode)
|
565 |
with gr.Row():
|
566 |
show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")
|
|
|
56 |
try:
|
57 |
duration = video_info.get('duration', 0)
|
58 |
title = video_info.get('title', '')
|
59 |
+
description = video_info.get('description', '')[:1500] # Increased limit for better context
|
60 |
|
61 |
if not duration:
|
62 |
return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
|
63 |
|
64 |
+
# Create enhanced prompt for Gemini
|
65 |
prompt = f"""
|
66 |
+
Analyze this YouTube video and create a highly detailed, scene-by-scene breakdown with precise timestamps and specific descriptions:
|
67 |
|
68 |
Title: {title}
|
69 |
Duration: {duration} seconds
|
70 |
Description: {description}
|
71 |
|
72 |
+
IMPORTANT INSTRUCTIONS:
|
73 |
+
1. Create detailed scene descriptions that include:
|
74 |
+
- Physical appearance of people (age, gender, clothing, hair, etc.)
|
75 |
+
- Exact actions being performed
|
76 |
+
- Dialogue or speech (if mentioned in title/description)
|
77 |
+
- Setting and environment details
|
78 |
+
- Props, objects, or products being shown
|
79 |
+
- Visual effects, text overlays, or graphics
|
80 |
+
- Mood, tone, and atmosphere
|
81 |
+
- Camera movements or angles (if apparent)
|
82 |
+
|
83 |
+
2. Timestamp Guidelines:
|
84 |
+
- For videos under 1 minute: 2-3 second segments
|
85 |
+
- For videos 1-5 minutes: 3-5 second segments
|
86 |
+
- For videos 5-15 minutes: 5-10 second segments
|
87 |
+
- For videos over 15 minutes: 10-15 second segments
|
88 |
+
- Maximum 20 scenes total for longer videos
|
89 |
+
|
90 |
+
3. Format each scene EXACTLY like this:
|
91 |
+
**[MM:SS-MM:SS]**: Detailed description including who is visible, what they're wearing, what they're doing, what they're saying (if applicable), setting details, objects shown, and any visual elements.
|
92 |
+
|
93 |
+
4. Be specific about:
|
94 |
+
- Character descriptions (appearance, clothing, expressions)
|
95 |
+
- Actions and movements
|
96 |
+
- Objects, products, or props being displayed
|
97 |
+
- Setting and background details
|
98 |
+
- Any text, graphics, or overlays
|
99 |
+
- Transitions between scenes
|
100 |
+
|
101 |
+
5. Write descriptions as if you're watching the video in real-time, noting everything visible and audible.
|
102 |
+
|
103 |
+
Based on the title and description, intelligently infer what would likely happen in each time segment. Consider the video type and create contextually appropriate, detailed descriptions.
|
104 |
"""
|
105 |
|
106 |
response = self.gemini_model.generate_content(prompt)
|
|
|
109 |
if response and response.text:
|
110 |
scenes = []
|
111 |
lines = response.text.split('\n')
|
112 |
+
current_scene = ""
|
113 |
+
|
114 |
for line in lines:
|
115 |
line = line.strip()
|
116 |
+
if line.startswith('**[') and ']:' in line:
|
117 |
+
# This is a new scene timestamp line
|
118 |
+
if current_scene:
|
119 |
+
scenes.append(current_scene.strip())
|
120 |
+
current_scene = line
|
121 |
+
elif current_scene and line:
|
122 |
+
# This is continuation of the current scene description
|
123 |
+
current_scene += " " + line
|
124 |
+
elif line.startswith('*') and '[' in line:
|
125 |
+
# Alternative format handling
|
126 |
+
if current_scene:
|
127 |
+
scenes.append(current_scene.strip())
|
128 |
+
current_scene = line
|
129 |
|
130 |
+
# Add the last scene if exists
|
131 |
+
if current_scene:
|
132 |
+
scenes.append(current_scene.strip())
|
133 |
+
|
134 |
return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
|
135 |
else:
|
136 |
return self.generate_scene_breakdown_fallback(video_info)
|
|
|
140 |
return self.generate_scene_breakdown_fallback(video_info)
|
141 |
|
142 |
def generate_scene_breakdown_fallback(self, video_info):
|
143 |
+
"""Enhanced fallback scene generation when Gemini is not available"""
|
144 |
duration = video_info.get('duration', 0)
|
145 |
title = video_info.get('title', '').lower()
|
146 |
+
description = video_info.get('description', '').lower()
|
147 |
+
uploader = video_info.get('uploader', 'Content creator')
|
148 |
|
149 |
if not duration:
|
150 |
return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
|
151 |
|
152 |
+
# Determine segment length based on duration
|
153 |
+
if duration <= 60:
|
154 |
+
segment_length = 3
|
155 |
+
elif duration <= 300:
|
156 |
+
segment_length = 5
|
157 |
+
elif duration <= 900:
|
158 |
+
segment_length = 10
|
159 |
else:
|
160 |
+
segment_length = 15
|
161 |
|
162 |
scenes = []
|
163 |
+
num_segments = min(duration // segment_length + 1, 20)
|
164 |
+
|
165 |
+
# Detect video type for better descriptions
|
166 |
+
video_type = self.detect_video_type_detailed(title, description)
|
167 |
|
168 |
for i in range(num_segments):
|
169 |
start_time = i * segment_length
|
|
|
172 |
start_formatted = f"{start_time//60}:{start_time%60:02d}"
|
173 |
end_formatted = f"{end_time//60}:{end_time%60:02d}"
|
174 |
|
175 |
+
# Generate contextual descriptions based on video type and timing
|
176 |
+
desc = self.generate_contextual_description(i, num_segments, video_type, uploader, title)
|
|
|
|
|
|
|
|
|
177 |
|
178 |
scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
|
179 |
|
180 |
return scenes
|
181 |
|
182 |
+
def detect_video_type_detailed(self, title, description):
    """Classify a video into a coarse content category from its text.

    The title and description are joined, lower-cased and scanned for
    category keywords. Categories are tested in a fixed priority order and
    the first one with a hit wins, so e.g. "morning yoga" is a 'vlog'
    ('morning') rather than 'fitness' ('yoga').

    Keywords must begin at a word boundary. A plain substring test would
    classify "latest" as a review ('test'), "canvas" as a review ('vs') and
    "discover" as music ('cover'); anchoring at the word start avoids those
    false positives while still accepting inflected forms such as
    "tutorials", "testing" or "reviewed".

    Args:
        title: Video title; ``None`` is treated as an empty string.
        description: Video description; ``None`` is treated as empty.

    Returns:
        One of 'tutorial', 'review', 'vlog', 'music', 'entertainment',
        'news', 'cooking', 'fitness', or 'general' when nothing matches.
    """
    import re  # function-scope import keeps this method self-contained

    text = f"{title or ''} {description or ''}".lower()

    # Priority-ordered (category, keywords) table; order is significant.
    keyword_table = (
        ('tutorial', ('tutorial', 'how to', 'guide', 'learn', 'diy', 'step by step')),
        ('review', ('review', 'unboxing', 'test', 'comparison', 'vs')),
        ('vlog', ('vlog', 'daily', 'routine', 'day in', 'morning', 'skincare')),
        ('music', ('music', 'song', 'cover', 'lyrics', 'dance')),
        ('entertainment', ('comedy', 'funny', 'prank', 'challenge', 'reaction')),
        ('news', ('news', 'breaking', 'update', 'report')),
        ('cooking', ('cooking', 'recipe', 'food', 'kitchen')),
        ('fitness', ('workout', 'fitness', 'exercise', 'yoga')),
    )

    for video_type, keywords in keyword_table:
        # \b only on the left: the keyword must start a word, but may be
        # followed by a suffix ("tests", "learning").
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')'
        if re.search(pattern, text):
            return video_type

    return 'general'
|
204 |
+
|
205 |
+
def generate_contextual_description(self, scene_index, total_scenes, video_type, uploader, title):
    """Build a generic, position-aware description for one fallback scene.

    The text is chosen from three template groups depending on where the
    scene sits in the video: the opening (index 0), the closing (last
    index) or the middle (everything else). Within each group the
    template is selected by ``video_type``, with a generic default for
    types that have no dedicated wording. When there is only one scene
    the opening template is used.

    Args:
        scene_index: Zero-based index of the scene being described.
        total_scenes: Total number of scenes in the breakdown.
        video_type: Category label such as 'tutorial', 'review', 'vlog',
            'cooking' or 'fitness'; other values use the generic text.
        uploader: Currently unused; kept so existing callers keep working.
        title: Video title, used only to guess how to refer to the
            presenter. ``None`` is treated as an empty string.

    Returns:
        A descriptive sentence (or two) for the scene.
    """
    import re  # function-scope import keeps this method self-contained

    title_text = (title or '').lower()

    # Whole-word matching: a bare substring test would turn titles
    # containing "performance", "human" or "manual" into "A man".
    if re.search(r'\b(?:woman|girls?)\b', title_text):
        presenter_desc = "A woman"
    elif re.search(r'\b(?:man|guys?)\b', title_text):
        presenter_desc = "A man"
    else:
        presenter_desc = "The content creator"

    opening_templates = {
        'tutorial': "{who} appears on screen, likely introducing themselves and the topic. They may be in a well-lit indoor setting, wearing casual clothing, and addressing the camera directly with a welcoming gesture.",
        'vlog': "{who} greets the camera with a smile, possibly waving. They appear to be in their usual filming location, wearing their typical style, and beginning their introduction to today's content.",
        'review': "{who} introduces the product or topic they'll be reviewing, likely holding or displaying the item. The setting appears organized, possibly with the product prominently featured.",
    }
    opening_default = "{who} appears on screen to begin the video, introducing the topic with engaging body language and clear speech directed at the audience."

    closing_templates = {
        'tutorial': "{who} concludes the tutorial, possibly showing the final result. They may be thanking viewers, asking for engagement (likes/comments), and suggesting related content.",
        'vlog': "{who} wraps up their vlog, possibly reflecting on the day's events. They appear relaxed and are likely saying goodbye to viewers with a friendly gesture.",
    }
    closing_default = "{who} concludes the video with final thoughts, thanking viewers for watching, and encouraging engagement through likes, comments, and subscriptions."

    middle_templates = {
        # For tutorials the scene index doubles as the step number.
        'tutorial': "{who} demonstrates step {step} of the process, showing specific techniques and explaining the procedure. They may be using tools or materials, with close-up shots of their hands working.",
        'review': "{who} examines different aspects of the product, pointing out features and sharing their opinions. They may be holding, using, or demonstrating the item while speaking to the camera.",
        'vlog': "{who} continues sharing their experience, possibly showing different locations or activities. The scene captures candid moments with natural lighting and casual interactions.",
        'cooking': "{who} works in the kitchen, preparing ingredients or cooking. They demonstrate techniques while explaining each step, with kitchen tools and ingredients visible on the counter.",
        'fitness': "{who} demonstrates exercise movements, likely in workout attire in a gym or home setting. They show proper form while providing instruction and motivation.",
    }
    middle_default = "{who} continues with the main content, engaging with the audience through clear explanations and demonstrations. The setting remains consistent with good lighting and clear audio."

    # Opening is checked before closing so a single-scene video gets the
    # introduction wording.
    if scene_index == 0:
        template = opening_templates.get(video_type, opening_default)
    elif scene_index == total_scenes - 1:
        template = closing_templates.get(video_type, closing_default)
    else:
        template = middle_templates.get(video_type, middle_default)

    return template.format(who=presenter_desc, step=scene_index)
|
256 |
+
|
257 |
def detect_video_type(self, title, description):
|
258 |
"""Detect video type based on title and description"""
|
259 |
text = (title + " " + description).lower()
|
|
|
675 |
inputs=[api_key_input],
|
676 |
outputs=[api_status, main_interface]
|
677 |
)
|
|
|
678 |
# Always show interface option (for fallback mode)
|
679 |
with gr.Row():
|
680 |
show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")
|