developer28 committed on
Commit
95cc944
·
verified ·
1 Parent(s): 2c78469

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -32
app.py CHANGED
@@ -56,31 +56,51 @@ class YouTubeDownloader:
56
  try:
57
  duration = video_info.get('duration', 0)
58
  title = video_info.get('title', '')
59
- description = video_info.get('description', '')[:1000] # Limit description length
60
 
61
  if not duration:
62
  return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
63
 
64
- # Create prompt for Gemini
65
  prompt = f"""
66
- Analyze this YouTube video and create a detailed scene-by-scene breakdown with timestamps:
67
 
68
  Title: {title}
69
  Duration: {duration} seconds
70
  Description: {description}
71
 
72
- Please provide a scene breakdown with the following format:
73
- - Divide the video into logical segments based on typical content flow
74
- - For videos under 2 minutes: 10-15 second segments
75
- - For videos 2-10 minutes: 30-45 second segments
76
- - For videos over 10 minutes: 60-90 second segments
77
- - Maximum 15 scenes total
78
-
79
- For each scene, provide:
80
- **[START_TIME-END_TIME]**: Detailed description of what likely happens in this segment, including visual elements, audio cues, potential dialogue or narration, and scene transitions.
81
-
82
- Consider the video type (tutorial, music video, vlog, etc.) and provide contextually appropriate descriptions.
83
- Format timestamps as MM:SS.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  """
85
 
86
  response = self.gemini_model.generate_content(prompt)
@@ -89,11 +109,28 @@ class YouTubeDownloader:
89
  if response and response.text:
90
  scenes = []
91
  lines = response.text.split('\n')
 
 
92
  for line in lines:
93
  line = line.strip()
94
- if line and ('**[' in line or line.startswith('*')):
95
- scenes.append(line)
 
 
 
 
 
 
 
 
 
 
 
96
 
 
 
 
 
97
  return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
98
  else:
99
  return self.generate_scene_breakdown_fallback(video_info)
@@ -103,23 +140,30 @@ class YouTubeDownloader:
103
  return self.generate_scene_breakdown_fallback(video_info)
104
 
105
  def generate_scene_breakdown_fallback(self, video_info):
106
- """Fallback scene generation when Gemini is not available"""
107
  duration = video_info.get('duration', 0)
108
  title = video_info.get('title', '').lower()
 
 
109
 
110
  if not duration:
111
  return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
112
 
113
- # Simple fallback logic
114
- if duration <= 120:
115
- segment_length = 15
116
- elif duration <= 600:
117
- segment_length = 45
 
 
118
  else:
119
- segment_length = 90
120
 
121
  scenes = []
122
- num_segments = min(duration // segment_length + 1, 15)
 
 
 
123
 
124
  for i in range(num_segments):
125
  start_time = i * segment_length
@@ -128,17 +172,88 @@ class YouTubeDownloader:
128
  start_formatted = f"{start_time//60}:{start_time%60:02d}"
129
  end_formatted = f"{end_time//60}:{end_time%60:02d}"
130
 
131
- if i == 0:
132
- desc = "Opening sequence with introduction and setup"
133
- elif i == num_segments - 1:
134
- desc = "Conclusion with final thoughts and call-to-action"
135
- else:
136
- desc = f"Main content segment {i} with key information and details"
137
 
138
  scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
139
 
140
  return scenes
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def detect_video_type(self, title, description):
143
  """Detect video type based on title and description"""
144
  text = (title + " " + description).lower()
@@ -560,7 +675,6 @@ def create_interface():
560
  inputs=[api_key_input],
561
  outputs=[api_status, main_interface]
562
  )
563
-
564
  # Always show interface option (for fallback mode)
565
  with gr.Row():
566
  show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")
 
56
  try:
57
  duration = video_info.get('duration', 0)
58
  title = video_info.get('title', '')
59
+ description = video_info.get('description', '')[:1500] # Increased limit for better context
60
 
61
  if not duration:
62
  return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
63
 
64
+ # Create enhanced prompt for Gemini
65
  prompt = f"""
66
+ Analyze this YouTube video and create a highly detailed, scene-by-scene breakdown with precise timestamps and specific descriptions:
67
 
68
  Title: {title}
69
  Duration: {duration} seconds
70
  Description: {description}
71
 
72
+ IMPORTANT INSTRUCTIONS:
73
+ 1. Create detailed scene descriptions that include:
74
+ - Physical appearance of people (age, gender, clothing, hair, etc.)
75
+ - Exact actions being performed
76
+ - Dialogue or speech (if mentioned in title/description)
77
+ - Setting and environment details
78
+ - Props, objects, or products being shown
79
+ - Visual effects, text overlays, or graphics
80
+ - Mood, tone, and atmosphere
81
+ - Camera movements or angles (if apparent)
82
+
83
+ 2. Timestamp Guidelines:
84
+ - For videos under 1 minute: 2-3 second segments
85
+ - For videos 1-5 minutes: 3-5 second segments
86
+ - For videos 5-15 minutes: 5-10 second segments
87
+ - For videos over 15 minutes: 10-15 second segments
88
+ - Maximum 20 scenes total for longer videos
89
+
90
+ 3. Format each scene EXACTLY like this:
91
+ **[MM:SS-MM:SS]**: Detailed description including who is visible, what they're wearing, what they're doing, what they're saying (if applicable), setting details, objects shown, and any visual elements.
92
+
93
+ 4. Be specific about:
94
+ - Character descriptions (appearance, clothing, expressions)
95
+ - Actions and movements
96
+ - Objects, products, or props being displayed
97
+ - Setting and background details
98
+ - Any text, graphics, or overlays
99
+ - Transitions between scenes
100
+
101
+ 5. Write descriptions as if you're watching the video in real-time, noting everything visible and audible.
102
+
103
+ Based on the title and description, intelligently infer what would likely happen in each time segment. Consider the video type and create contextually appropriate, detailed descriptions.
104
  """
105
 
106
  response = self.gemini_model.generate_content(prompt)
 
109
  if response and response.text:
110
  scenes = []
111
  lines = response.text.split('\n')
112
+ current_scene = ""
113
+
114
  for line in lines:
115
  line = line.strip()
116
+ if line.startswith('**[') and ']:' in line:
117
+ # This is a new scene timestamp line
118
+ if current_scene:
119
+ scenes.append(current_scene.strip())
120
+ current_scene = line
121
+ elif current_scene and line:
122
+ # This is continuation of the current scene description
123
+ current_scene += " " + line
124
+ elif line.startswith('*') and '[' in line:
125
+ # Alternative format handling
126
+ if current_scene:
127
+ scenes.append(current_scene.strip())
128
+ current_scene = line
129
 
130
+ # Add the last scene if exists
131
+ if current_scene:
132
+ scenes.append(current_scene.strip())
133
+
134
  return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
135
  else:
136
  return self.generate_scene_breakdown_fallback(video_info)
 
140
  return self.generate_scene_breakdown_fallback(video_info)
141
 
142
  def generate_scene_breakdown_fallback(self, video_info):
143
+ """Enhanced fallback scene generation when Gemini is not available"""
144
  duration = video_info.get('duration', 0)
145
  title = video_info.get('title', '').lower()
146
+ description = video_info.get('description', '').lower()
147
+ uploader = video_info.get('uploader', 'Content creator')
148
 
149
  if not duration:
150
  return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
151
 
152
+ # Determine segment length based on duration
153
+ if duration <= 60:
154
+ segment_length = 3
155
+ elif duration <= 300:
156
+ segment_length = 5
157
+ elif duration <= 900:
158
+ segment_length = 10
159
  else:
160
+ segment_length = 15
161
 
162
  scenes = []
163
+ num_segments = min(duration // segment_length + 1, 20)
164
+
165
+ # Detect video type for better descriptions
166
+ video_type = self.detect_video_type_detailed(title, description)
167
 
168
  for i in range(num_segments):
169
  start_time = i * segment_length
 
172
  start_formatted = f"{start_time//60}:{start_time%60:02d}"
173
  end_formatted = f"{end_time//60}:{end_time%60:02d}"
174
 
175
+ # Generate contextual descriptions based on video type and timing
176
+ desc = self.generate_contextual_description(i, num_segments, video_type, uploader, title)
 
 
 
 
177
 
178
  scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
179
 
180
  return scenes
181
 
182
+ def detect_video_type_detailed(self, title, description):
183
+ """Detect video type with more detail for better fallback descriptions"""
184
+ text = (title + " " + description).lower()
185
+
186
+ if any(word in text for word in ['tutorial', 'how to', 'guide', 'learn', 'diy', 'step by step']):
187
+ return 'tutorial'
188
+ elif any(word in text for word in ['review', 'unboxing', 'test', 'comparison', 'vs']):
189
+ return 'review'
190
+ elif any(word in text for word in ['vlog', 'daily', 'routine', 'day in', 'morning', 'skincare']):
191
+ return 'vlog'
192
+ elif any(word in text for word in ['music', 'song', 'cover', 'lyrics', 'dance']):
193
+ return 'music'
194
+ elif any(word in text for word in ['comedy', 'funny', 'prank', 'challenge', 'reaction']):
195
+ return 'entertainment'
196
+ elif any(word in text for word in ['news', 'breaking', 'update', 'report']):
197
+ return 'news'
198
+ elif any(word in text for word in ['cooking', 'recipe', 'food', 'kitchen']):
199
+ return 'cooking'
200
+ elif any(word in text for word in ['workout', 'fitness', 'exercise', 'yoga']):
201
+ return 'fitness'
202
+ else:
203
+ return 'general'
204
+
205
+ def generate_contextual_description(self, scene_index, total_scenes, video_type, uploader, title):
206
+ """Generate contextual descriptions based on video type and scene position"""
207
+
208
+ # Common elements
209
+ presenter_desc = f"The content creator"
210
+ if 'woman' in title.lower() or 'girl' in title.lower():
211
+ presenter_desc = "A woman"
212
+ elif 'man' in title.lower() or 'guy' in title.lower():
213
+ presenter_desc = "A man"
214
+
215
+ # Position-based descriptions
216
+ if scene_index == 0:
217
+ # Opening scene
218
+ if video_type == 'tutorial':
219
+ return f"{presenter_desc} appears on screen, likely introducing themselves and the topic. They may be in a well-lit indoor setting, wearing casual clothing, and addressing the camera directly with a welcoming gesture."
220
+ elif video_type == 'vlog':
221
+ return f"{presenter_desc} greets the camera with a smile, possibly waving. They appear to be in their usual filming location, wearing their typical style, and beginning their introduction to today's content."
222
+ elif video_type == 'review':
223
+ return f"{presenter_desc} introduces the product or topic they'll be reviewing, likely holding or displaying the item. The setting appears organized, possibly with the product prominently featured."
224
+ else:
225
+ return f"{presenter_desc} appears on screen to begin the video, introducing the topic with engaging body language and clear speech directed at the audience."
226
+
227
+ elif scene_index == total_scenes - 1:
228
+ # Closing scene
229
+ if video_type == 'tutorial':
230
+ return f"{presenter_desc} concludes the tutorial, possibly showing the final result. They may be thanking viewers, asking for engagement (likes/comments), and suggesting related content."
231
+ elif video_type == 'vlog':
232
+ return f"{presenter_desc} wraps up their vlog, possibly reflecting on the day's events. They appear relaxed and are likely saying goodbye to viewers with a friendly gesture."
233
+ else:
234
+ return f"{presenter_desc} concludes the video with final thoughts, thanking viewers for watching, and encouraging engagement through likes, comments, and subscriptions."
235
+
236
+ else:
237
+ # Middle scenes - content-specific
238
+ if video_type == 'tutorial':
239
+ step_num = scene_index
240
+ return f"{presenter_desc} demonstrates step {step_num} of the process, showing specific techniques and explaining the procedure. They may be using tools or materials, with close-up shots of their hands working."
241
+
242
+ elif video_type == 'review':
243
+ return f"{presenter_desc} examines different aspects of the product, pointing out features and sharing their opinions. They may be holding, using, or demonstrating the item while speaking to the camera."
244
+
245
+ elif video_type == 'vlog':
246
+ return f"{presenter_desc} continues sharing their experience, possibly showing different locations or activities. The scene captures candid moments with natural lighting and casual interactions."
247
+
248
+ elif video_type == 'cooking':
249
+ return f"{presenter_desc} works in the kitchen, preparing ingredients or cooking. They demonstrate techniques while explaining each step, with kitchen tools and ingredients visible on the counter."
250
+
251
+ elif video_type == 'fitness':
252
+ return f"{presenter_desc} demonstrates exercise movements, likely in workout attire in a gym or home setting. They show proper form while providing instruction and motivation."
253
+
254
+ else:
255
+ return f"{presenter_desc} continues with the main content, engaging with the audience through clear explanations and demonstrations. The setting remains consistent with good lighting and clear audio."
256
+
257
  def detect_video_type(self, title, description):
258
  """Detect video type based on title and description"""
259
  text = (title + " " + description).lower()
 
675
  inputs=[api_key_input],
676
  outputs=[api_status, main_interface]
677
  )
 
678
  # Always show interface option (for fallback mode)
679
  with gr.Row():
680
  show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")