Spaces:

developer28
/

Youtubedownloader

Sleeping

App Files Files Community

developer28 committed on Jun 18

Commit

95cc944

verified ·

1 Parent(s): 2c78469

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -32

app.py CHANGED Viewed

@@ -56,31 +56,51 @@ class YouTubeDownloader:
         try:
             duration = video_info.get('duration', 0)
             title = video_info.get('title', '')
-            description = video_info.get('description', '')[:1000]  # Limit description length
             if not duration:
                 return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
-            # Create prompt for Gemini
             prompt = f"""
-            Analyze this YouTube video and create a detailed scene-by-scene breakdown with timestamps:
             Title: {title}
             Duration: {duration} seconds
             Description: {description}
-            Please provide a scene breakdown with the following format:
-            - Divide the video into logical segments based on typical content flow
-            - For videos under 2 minutes: 10-15 second segments
-            - For videos 2-10 minutes: 30-45 second segments
-            - For videos over 10 minutes: 60-90 second segments
-            - Maximum 15 scenes total
-            For each scene, provide:
-            **[START_TIME-END_TIME]**: Detailed description of what likely happens in this segment, including visual elements, audio cues, potential dialogue or narration, and scene transitions.
-            Consider the video type (tutorial, music video, vlog, etc.) and provide contextually appropriate descriptions.
-            Format timestamps as MM:SS.
             """
             response = self.gemini_model.generate_content(prompt)
@@ -89,11 +109,28 @@ class YouTubeDownloader:
             if response and response.text:
                 scenes = []
                 lines = response.text.split('\n')
                 for line in lines:
                     line = line.strip()
-                    if line and ('**[' in line or line.startswith('*')):
-                        scenes.append(line)
                 return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
             else:
                 return self.generate_scene_breakdown_fallback(video_info)
@@ -103,23 +140,30 @@ class YouTubeDownloader:
             return self.generate_scene_breakdown_fallback(video_info)
     def generate_scene_breakdown_fallback(self, video_info):
-        """Fallback scene generation when Gemini is not available"""
         duration = video_info.get('duration', 0)
         title = video_info.get('title', '').lower()
         if not duration:
             return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
-        # Simple fallback logic
-        if duration <= 120:
-            segment_length = 15
-        elif duration <= 600:
-            segment_length = 45
         else:
-            segment_length = 90
         scenes = []
-        num_segments = min(duration // segment_length + 1, 15)
         for i in range(num_segments):
             start_time = i * segment_length
@@ -128,17 +172,88 @@ class YouTubeDownloader:
             start_formatted = f"{start_time//60}:{start_time%60:02d}"
             end_formatted = f"{end_time//60}:{end_time%60:02d}"
-            if i == 0:
-                desc = "Opening sequence with introduction and setup"
-            elif i == num_segments - 1:
-                desc = "Conclusion with final thoughts and call-to-action"
-            else:
-                desc = f"Main content segment {i} with key information and details"
             scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
         return scenes
     def detect_video_type(self, title, description):
         """Detect video type based on title and description"""
         text = (title + " " + description).lower()
@@ -560,7 +675,6 @@ def create_interface():
             inputs=[api_key_input],
             outputs=[api_status, main_interface]
         )
         # Always show interface option (for fallback mode)
         with gr.Row():
             show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")

         try:
             duration = video_info.get('duration', 0)
             title = video_info.get('title', '')
+            description = video_info.get('description', '')[:1500]  # Increased limit for better context
             if not duration:
                 return ["**[Duration Unknown]**: Unable to generate timestamped breakdown - video duration not available"]
+            # Create enhanced prompt for Gemini
             prompt = f"""
+            Analyze this YouTube video and create a highly detailed, scene-by-scene breakdown with precise timestamps and specific descriptions:
             Title: {title}
             Duration: {duration} seconds
             Description: {description}
+            IMPORTANT INSTRUCTIONS:
+            1. Create detailed scene descriptions that include:
+               - Physical appearance of people (age, gender, clothing, hair, etc.)
+               - Exact actions being performed
+               - Dialogue or speech (if mentioned in title/description)
+               - Setting and environment details
+               - Props, objects, or products being shown
+               - Visual effects, text overlays, or graphics
+               - Mood, tone, and atmosphere
+               - Camera movements or angles (if apparent)
+            2. Timestamp Guidelines:
+               - For videos under 1 minute: 2-3 second segments
+               - For videos 1-5 minutes: 3-5 second segments
+               - For videos 5-15 minutes: 5-10 second segments
+               - For videos over 15 minutes: 10-15 second segments
+               - Maximum 20 scenes total for longer videos
+            3. Format each scene EXACTLY like this:
+               **[MM:SS-MM:SS]**: Detailed description including who is visible, what they're wearing, what they're doing, what they're saying (if applicable), setting details, objects shown, and any visual elements.
+            4. Be specific about:
+               - Character descriptions (appearance, clothing, expressions)
+               - Actions and movements
+               - Objects, products, or props being displayed
+               - Setting and background details
+               - Any text, graphics, or overlays
+               - Transitions between scenes
+            5. Write descriptions as if you're watching the video in real-time, noting everything visible and audible.
+            Based on the title and description, intelligently infer what would likely happen in each time segment. Consider the video type and create contextually appropriate, detailed descriptions.
             """
             response = self.gemini_model.generate_content(prompt)
             if response and response.text:
                 scenes = []
                 lines = response.text.split('\n')
+                current_scene = ""
                 for line in lines:
                     line = line.strip()
+                    if line.startswith('**[') and ']:' in line:
+                        # This is a new scene timestamp line
+                        if current_scene:
+                            scenes.append(current_scene.strip())
+                        current_scene = line
+                    elif current_scene and line:
+                        # This is continuation of the current scene description
+                        current_scene += " " + line
+                    elif line.startswith('*') and '[' in line:
+                        # Alternative format handling
+                        if current_scene:
+                            scenes.append(current_scene.strip())
+                        current_scene = line
+                # Add the last scene if exists
+                if current_scene:
+                    scenes.append(current_scene.strip())
                 return scenes if scenes else self.generate_scene_breakdown_fallback(video_info)
             else:
                 return self.generate_scene_breakdown_fallback(video_info)
             return self.generate_scene_breakdown_fallback(video_info)
     def generate_scene_breakdown_fallback(self, video_info):
+        """Enhanced fallback scene generation when Gemini is not available"""
         duration = video_info.get('duration', 0)
         title = video_info.get('title', '').lower()
+        description = video_info.get('description', '').lower()
+        uploader = video_info.get('uploader', 'Content creator')
         if not duration:
             return ["**[Duration Unknown]**: Unable to generate timestamped breakdown"]
+        # Determine segment length based on duration
+        if duration <= 60:
+            segment_length = 3
+        elif duration <= 300:
+            segment_length = 5
+        elif duration <= 900:
+            segment_length = 10
         else:
+            segment_length = 15
         scenes = []
+        num_segments = min(duration // segment_length + 1, 20)
+        # Detect video type for better descriptions
+        video_type = self.detect_video_type_detailed(title, description)
         for i in range(num_segments):
             start_time = i * segment_length
             start_formatted = f"{start_time//60}:{start_time%60:02d}"
             end_formatted = f"{end_time//60}:{end_time%60:02d}"
+            # Generate contextual descriptions based on video type and timing
+            desc = self.generate_contextual_description(i, num_segments, video_type, uploader, title)
             scenes.append(f"**[{start_formatted}-{end_formatted}]**: {desc}")
         return scenes
+    def detect_video_type_detailed(self, title, description):
+        """Classify a video into a coarse category from its title and description.
+
+        The title and description are joined, lower-cased, and scanned against
+        an ordered keyword table; the first category with any keyword present
+        wins. Keywords are matched as plain substrings (so 'test' also matches
+        inside 'latest'). Returns 'general' when nothing matches.
+        """
+        haystack = " ".join((title, description)).lower()
+        # Order matters: earlier rows take priority over later ones.
+        keyword_table = (
+            ('tutorial', ('tutorial', 'how to', 'guide', 'learn', 'diy', 'step by step')),
+            ('review', ('review', 'unboxing', 'test', 'comparison', 'vs')),
+            ('vlog', ('vlog', 'daily', 'routine', 'day in', 'morning', 'skincare')),
+            ('music', ('music', 'song', 'cover', 'lyrics', 'dance')),
+            ('entertainment', ('comedy', 'funny', 'prank', 'challenge', 'reaction')),
+            ('news', ('news', 'breaking', 'update', 'report')),
+            ('cooking', ('cooking', 'recipe', 'food', 'kitchen')),
+            ('fitness', ('workout', 'fitness', 'exercise', 'yoga')),
+        )
+        for label, keywords in keyword_table:
+            for keyword in keywords:
+                if keyword in haystack:
+                    return label
+        return 'general'
+    def generate_contextual_description(self, scene_index, total_scenes, video_type, uploader, title):
+        """Build a generic, position-aware description for one fallback scene.
+
+        Args:
+            scene_index: Zero-based index of the scene being described.
+            total_scenes: Number of scenes in the breakdown; the scene at
+                ``total_scenes - 1`` receives the closing text.
+            video_type: Category label such as 'tutorial', 'review', 'vlog',
+                'cooking' or 'fitness'; any other value uses generic wording.
+            uploader: Accepted for interface compatibility; not used here.
+            title: Video title, scanned for gender words to pick the subject.
+
+        Returns:
+            A sentence-style description string for the scene.
+        """
+        import re  # local import keeps this method self-contained
+        lowered_title = (title or "").lower()
+        # Match whole words only: a plain substring test for 'man' also fires
+        # on 'manual', 'romantic', 'human', ... and mislabels the presenter.
+        if re.search(r"\b(?:woman|girls?)\b", lowered_title):
+            presenter_desc = "A woman"
+        elif re.search(r"\b(?:man|guys?)\b", lowered_title):
+            presenter_desc = "A man"
+        else:
+            presenter_desc = "The content creator"
+        # Index 0 is tested first, so a single-scene video gets the opening text.
+        if scene_index == 0:
+            # Opening scene
+            if video_type == 'tutorial':
+                return f"{presenter_desc} appears on screen, likely introducing themselves and the topic. They may be in a well-lit indoor setting, wearing casual clothing, and addressing the camera directly with a welcoming gesture."
+            if video_type == 'vlog':
+                return f"{presenter_desc} greets the camera with a smile, possibly waving. They appear to be in their usual filming location, wearing their typical style, and beginning their introduction to today's content."
+            if video_type == 'review':
+                return f"{presenter_desc} introduces the product or topic they'll be reviewing, likely holding or displaying the item. The setting appears organized, possibly with the product prominently featured."
+            return f"{presenter_desc} appears on screen to begin the video, introducing the topic with engaging body language and clear speech directed at the audience."
+        if scene_index == total_scenes - 1:
+            # Closing scene
+            if video_type == 'tutorial':
+                return f"{presenter_desc} concludes the tutorial, possibly showing the final result. They may be thanking viewers, asking for engagement (likes/comments), and suggesting related content."
+            if video_type == 'vlog':
+                return f"{presenter_desc} wraps up their vlog, possibly reflecting on the day's events. They appear relaxed and are likely saying goodbye to viewers with a friendly gesture."
+            return f"{presenter_desc} concludes the video with final thoughts, thanking viewers for watching, and encouraging engagement through likes, comments, and subscriptions."
+        # Middle scenes - content-specific
+        if video_type == 'tutorial':
+            # The scene index doubles as the step number (scene 0 is the intro).
+            return f"{presenter_desc} demonstrates step {scene_index} of the process, showing specific techniques and explaining the procedure. They may be using tools or materials, with close-up shots of their hands working."
+        if video_type == 'review':
+            return f"{presenter_desc} examines different aspects of the product, pointing out features and sharing their opinions. They may be holding, using, or demonstrating the item while speaking to the camera."
+        if video_type == 'vlog':
+            return f"{presenter_desc} continues sharing their experience, possibly showing different locations or activities. The scene captures candid moments with natural lighting and casual interactions."
+        if video_type == 'cooking':
+            return f"{presenter_desc} works in the kitchen, preparing ingredients or cooking. They demonstrate techniques while explaining each step, with kitchen tools and ingredients visible on the counter."
+        if video_type == 'fitness':
+            return f"{presenter_desc} demonstrates exercise movements, likely in workout attire in a gym or home setting. They show proper form while providing instruction and motivation."
+        return f"{presenter_desc} continues with the main content, engaging with the audience through clear explanations and demonstrations. The setting remains consistent with good lighting and clear audio."
     def detect_video_type(self, title, description):
         """Detect video type based on title and description"""
         text = (title + " " + description).lower()
             inputs=[api_key_input],
             outputs=[api_status, main_interface]
         )
         # Always show interface option (for fallback mode)
         with gr.Row():
             show_interface_btn = gr.Button("🚀 Use Without Gemini API (Fallback Mode)", variant="secondary")