Spaces:

ali-kanbar
/

Animation-To-Script

Running

App Files Files Community

ali-kanbar committed on Apr 26

Commit

bba1aed

verified ·

1 Parent(s): ca4d93a

Update app.py

Browse files

Files changed (1) hide show

app.py +535 -239

app.py CHANGED Viewed

@@ -1,239 +1,535 @@
-import gradio as gr
-import asyncio
-import os
-import traceback
-import numpy as np
-import re
-from functools import partial
-import torch
-import imageio
-import cv2
-from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
-from PIL import Image
-import edge_tts
-from transformers import AutoTokenizer, pipeline
-from moviepy.editor import VideoFileClip, AudioFileClip
-from func_timeout import func_timeout, FunctionTimedOut
-# Initialize models with cache optimization
-def initialize_components():
-    global tokenizer, text_pipe, sentiment_analyzer, pipe
-    # Text generation components
-    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", cache_dir="model_cache")
-    text_pipe = pipeline(
-        "text-generation",
-        model="Qwen/Qwen2.5-1.5B-Instruct",
-        tokenizer=tokenizer,
-        device_map="auto",
-        cache_dir="model_cache"
-    )
-    # Sentiment analysis
-    sentiment_analyzer = pipeline("sentiment-analysis", cache_dir="model_cache")
-    # Video generation setup
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = torch.float16 if device == "cuda" else torch.float32
-    step = 8
-    repo = "ByteDance/AnimateDiff-Lightning"
-    ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
-    base = "emilianJR/epiCRealism"
-    # Load motion adapter with caching
-    adapter = MotionAdapter().to(device, dtype)
-    model_path = hf_hub_download(repo, ckpt, cache_dir="model_cache")
-    adapter.load_state_dict(load_file(model_path, device=device))
-    # Initialize pipeline
-    pipe = AnimateDiffPipeline.from_pretrained(
-        base,
-        motion_adapter=adapter,
-        torch_dtype=dtype,
-        cache_dir="model_cache"
-    ).to(device)
-    pipe.scheduler = EulerDiscreteScheduler.from_config(
-        pipe.scheduler.config,
-        timestep_spacing="trailing",
-        beta_schedule="linear"
-    )
-initialize_components()
-# Cleanup function for resource management
-def cleanup():
-    torch.cuda.empty_cache()
-    for f in ["generated_video.mp4", "final_video_with_audio.mp4", "output.mp3"]:
-        if os.path.exists(f):
-            try:
-                os.remove(f)
-            except:
-                pass
-# Story generation functions (keep your original functions but add timeout)
-def generate_video(summary):
-    def crossfade_transition(frames1, frames2, transition_length=10):
-        blended_frames = []
-        frames1_np = [np.array(frame) for frame in frames1[-transition_length:]]
-        frames2_np = [np.array(frame) for frame in frames2[:transition_length]]
-        for i in range(transition_length):
-            alpha = i / transition_length
-            beta = 1.0 - alpha
-            blended = cv2.addWeighted(frames1_np[i], beta, frames2_np[i], alpha, 0)
-            blended_frames.append(Image.fromarray(blended))
-        return blended_frames
-    sentences = []
-    current_sentence = ""
-    for char in summary:
-        current_sentence += char
-        if char in {'.', '!', '?'}:
-            sentences.append(current_sentence.strip())
-            current_sentence = ""
-    sentences = [s.strip() for s in sentences if s.strip()]
-    output_dir = "generated_frames"
-    video_path = "generated_video.mp4"
-    os.makedirs(output_dir, exist_ok=True)
-    all_frames = []
-    previous_frames = None
-    transition_frames = 10
-    batch_size = 1
-    for i in range(0, len(sentences), batch_size):
-        batch_prompts = sentences[i : i + batch_size]
-        for idx, prompt in enumerate(batch_prompts):
-            try:
-                output = func_timeout(
-                    300,  # 5 minute timeout per scene
-                    pipe,
-                    args=(prompt,),
-                    kwargs={
-                        'guidance_scale': 1.0,
-                        'num_inference_steps': step,
-                        'width': 128,  # Reduced resolution
-                        'height': 128
-                    }
-                )
-                frames = output.frames[0]
-                if previous_frames is not None:
-                    transition = crossfade_transition(previous_frames, frames, transition_frames)
-                    all_frames.extend(transition)
-                all_frames.extend(frames)
-                previous_frames = frames
-            except FunctionTimedOut:
-                print(f"Timeout generating scene {i+idx+1}")
-                return None
-            except Exception as e:
-                print(f"Error generating scene: {str(e)}")
-                continue
-    imageio.mimsave(video_path, all_frames, fps=6)  # Reduced FPS
-    return video_path
-# Modified main processing function with enhanced error handling
-def create_story_video(prompt, progress=gr.Progress()):
-    cleanup()  # Clear previous runs
-    if not prompt or len(prompt.strip()) < 5:
-        return "Prompt too short (min 5 characters)", None, None
-    if len(prompt) > 500:
-        return "Prompt too long (max 500 characters)", None, None
-    try:
-        progress(0, desc="Starting story generation...")
-        story = generate_story(prompt)
-        progress(25, desc="Story generated")
-        progress(30, desc="Starting video generation...")
-        video_path = generate_video(story)
-        if not video_path:
-            return story, None, "Video generation failed"
-        progress(60, desc="Video rendered")
-        progress(65, desc="Creating audio summary...")
-        audio_summary = summary_of_summary(story, video_path)
-        progress(75, desc="Generating voiceover...")
-        try:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-            audio_file = loop.run_until_complete(
-                generate_audio_with_sentiment(audio_summary, sentiment_analyzer)
-            )
-        except Exception as e:
-            return story, None, f"Audio error: {str(e)}"
-        progress(90, desc="Finalizing video...")
-        output_path = 'final_video_with_audio.mp4'
-        combine_video_with_audio(video_path, audio_file, output_path)
-        return story, output_path, audio_summary
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        print(traceback.format_exc())
-        return error_msg, None, None
-# Keep other functions (summarize, generate_story, etc.) unchanged from your original code
-# ...
-# Gradio interface setup with resource management
-EXAMPLE_PROMPTS = [
-    "A nurse discovers an unusual pattern in patient symptoms.",
-    "A family finds a time capsule during home renovation.",
-    "A restaurant owner innovates to save their business.",
-    "Wildlife tracking reveals climate changes.",
-    "Community rebuilds after natural disaster."
-]
-with gr.Blocks(title="AI Story Generator", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🎬 AI Story Video Generator")
-    gr.Markdown("Enter a short story idea (5-500 characters)")
-    with gr.Row():
-        prompt_input = gr.Textbox(
-            label="Story Idea",
-            placeholder="Example: A detective finds a hidden room...",
-            max_lines=2
-        )
-    gr.Examples(
-        examples=EXAMPLE_PROMPTS,
-        inputs=prompt_input,
-        label="Example Prompts"
-    )
-    with gr.Row():
-        generate_btn = gr.Button("Generate", variant="primary")
-        clear_btn = gr.Button("Clear", variant="secondary")
-    with gr.Tabs():
-        with gr.Tab("Results"):
-            video_output = gr.Video(label="Generated Video", interactive=False)
-            story_output = gr.Textbox(label="Full Story", lines=10)
-            audio_summary = gr.Textbox(label="Audio Summary", lines=3)
-    generate_btn.click(
-        fn=create_story_video,
-        inputs=prompt_input,
-        outputs=[story_output, video_output, audio_summary]
-    )
-    clear_btn.click(
-        fn=lambda: [None, None, None],
-        outputs=[story_output, video_output, audio_summary]
-    )
-    demo.load(fn=cleanup)
-    demo.unload(fn=cleanup)
-if __name__ == "__main__":
-    demo.launch(server_port=7860, show_error=True)

+import gradio as gr
+import asyncio
+import os
+import traceback
+import numpy as np
+import re
+from functools import partial
+# Import all required libraries
+import torch
+import imageio
+import cv2
+from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from PIL import Image
+import edge_tts
+from transformers import AutoTokenizer, pipeline
+from moviepy.editor import VideoFileClip, AudioFileClip
+# ---------------------------------------------------------------------------
+# Module-level model setup. Everything below runs once, at import time, and
+# the resulting objects (tokenizer, text_pipe, sentiment_analyzer, pipe, step)
+# are read as globals by the functions defined later in this file.
+# ---------------------------------------------------------------------------
+# Initialize the Qwen model
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
+text_pipe = pipeline(
+    "text-generation",
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    tokenizer=tokenizer
+)
+# Initialize the sentiment analyzer
+# No model name is given, so the library's default sentiment model is used.
+sentiment_analyzer = pipeline("sentiment-analysis")
+# Initialize video generation components
+# Half precision is only selected when CUDA is available; CPU runs in float32.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# `step` selects the checkpoint file below and is also passed as
+# num_inference_steps in generate_video, so the two always agree.
+step = 8
+repo = "ByteDance/AnimateDiff-Lightning"
+ckpt = f"animatediff_lightning_{step}step_diffusers.safetensors"
+base = "emilianJR/epiCRealism"
+# Load motion adapter
+# The adapter is created empty and then filled from the downloaded checkpoint.
+adapter = MotionAdapter().to(device, dtype)
+adapter.load_state_dict(load_file(hf_hub_download(repo, ckpt), device=device))
+# Load pipeline
+pipe = AnimateDiffPipeline.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
+# Replace the default scheduler with an Euler scheduler built from the same config.
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing", beta_schedule="linear")
+# Define all required functions
+def summarize(text):
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are an expert summarizer focused on efficiency and clarity. "
+                "Create concise narrative summaries that: "
+                "1. Capture all key points and main ideas "
+                "2. Omit examples, repetitions, and secondary details "
+                "3. Maintain logical flow and coherence "
+                "4. Use clear, direct language without markdown formatting"
+            )
+        },
+        {
+            "role": "user",
+            "content": (
+                "Please summarize the following text in 10-15 sentences. "
+                "Focus on essential information, exclude non-critical details, "
+                f"and maintain natural storytelling flow:\n\n{text}"
+            )
+        }
+    ]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    response = text_pipe(
+        prompt,
+        max_new_tokens=512,
+        num_beams=4,
+        early_stopping=True,
+        no_repeat_ngram_size=3,
+        temperature=0.7,
+        top_p=0.95,
+        do_sample=True
+    )
+    result = response[0]['generated_text']
+    summary = result.split("assistant\n")[-1].strip()
+    return summary
+def generate_story(prompt):
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a skilled storyteller specializing in tight, impactful narratives. "
+                "Create engaging stories that:\n"
+                "1. Contain exactly 15-20 sentences\n"
+                "2. Keep each sentence under 77 tokens\n"
+                "3. Maintain strong narrative flow and pacing\n"
+                "4. Focus on vivid imagery and concrete details\n"
+                "5. Avoid filler words and redundant phrases\n"
+                "6. Use simple, direct language without markdown"
+            )
+        },
+        {
+            "role": "user",
+            "content": (
+                f"Craft a compelling short story based on this premise: {prompt}\n"
+                "Structure requirements:\n"
+                "- Strict 15-20 sentence count\n"
+                "- Maximum 77 tokens per sentence\n"
+                "- Clear beginning-middle-end structure\n"
+                "- Emphasis on showing rather than telling\n"
+                "Output plain text only, no markdown formatting."
+            )
+        }
+    ]
+    chat_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    # First attempt to generate story
+    generated = text_pipe(
+        chat_prompt,
+        max_new_tokens=1024,
+        num_beams=5,
+        early_stopping=True,
+        no_repeat_ngram_size=4,
+        temperature=0.65,
+        top_k=30,
+        top_p=0.90,
+        do_sample=True,
+        length_penalty=0.9
+    )
+    full_output = generated[0]['generated_text']
+    story = full_output.split("assistant\n")[-1].strip()
+    # Process sentences and check constraints
+    sentences = []
+    for s in story.split('.'):
+        if s.strip():
+            sentences.append(s.strip())
+    # Check sentence count constraint
+    sentence_count = len(sentences)
+    if sentence_count < 15 or sentence_count > 20:
+        # Regenerate with stricter parameters if constraints not met
+        enhanced_prompt = f"{prompt} (IMPORTANT: Story MUST have EXACTLY 15-20 sentences, and each sentence MUST be under 77 tokens. Current attempt had {sentence_count} sentences.)"
+        messages[1]["content"] = (
+            f"Craft a compelling short story based on this premise: {enhanced_prompt}\n"
+            "Structure requirements:\n"
+            "- CRITICAL: Output EXACTLY 15-20 sentences, not more, not less\n"
+            "- CRITICAL: Maximum 77 tokens per sentence\n"
+            "- Clear beginning-middle-end structure\n"
+            "- Emphasis on showing rather than telling\n"
+            "Output plain text only, no markdown formatting."
+        )
+        chat_prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Try with more strict parameters
+        generated = text_pipe(
+            chat_prompt,
+            max_new_tokens=1024,
+            num_beams=7,
+            early_stopping=True,
+            no_repeat_ngram_size=4,
+            temperature=0.5,
+            top_k=20,
+            top_p=0.85,
+            do_sample=True,
+            length_penalty=1.0
+        )
+        full_output = generated[0]['generated_text']
+        story = full_output.split("assistant\n")[-1].strip()
+        sentences = []
+        for s in story.split('.'):
+            if s.strip():
+                sentences.append(s.strip())
+    word_to_token_ratio = 1.3
+    constrained_sentences = []
+    for sentence in sentences:
+        words = sentence.split()
+        estimated_tokens = len(words) * word_to_token_ratio
+        if estimated_tokens > 77:
+            max_words = int(75 / word_to_token_ratio)
+            truncated = ' '.join(words[:max_words])
+            constrained_sentences.append(truncated)
+        else:
+            constrained_sentences.append(sentence)
+    while len(constrained_sentences) < 15:
+        constrained_sentences.append("The story continued with unexpected twists and turns.")
+    constrained_sentences = constrained_sentences[:20]
+    formatted_sentences = []
+    for s in constrained_sentences:
+        if not s.endswith(('.', '!', '?')):
+            s += '.'
+        formatted_sentences.append(s)
+    final_story = '\n'.join(formatted_sentences)
+    return final_story
+def generate_video(summary):
+    def crossfade_transition(frames1, frames2, transition_length=10):
+        blended_frames = []
+        frames1_np = [np.array(frame) for frame in frames1[-transition_length:]]
+        frames2_np = [np.array(frame) for frame in frames2[:transition_length]]
+        for i in range(transition_length):
+            alpha = i / transition_length
+            beta = 1.0 - alpha
+            blended = cv2.addWeighted(frames1_np[i], beta, frames2_np[i], alpha, 0)
+            blended_frames.append(Image.fromarray(blended))
+        return blended_frames
+    # Sentence splitting
+    sentences = []
+    current_sentence = ""
+    for char in summary:
+        current_sentence += char
+        if char in {'.', '!', '?'}:
+            sentences.append(current_sentence.strip())
+            current_sentence = ""
+    sentences = [s.strip() for s in sentences if s.strip()]
+    print(f"Total scenes: {len(sentences)}")
+    # Output config
+    output_dir = "generated_frames"
+    video_path = "generated_video.mp4"
+    os.makedirs(output_dir, exist_ok=True)
+    # Generate animation
+    all_frames = []
+    previous_frames = None
+    transition_frames = 10
+    batch_size = 1
+    for i in range(0, len(sentences), batch_size):
+        batch_prompts = sentences[i : i + batch_size]
+        for idx, prompt in enumerate(batch_prompts):
+            print(f"Generating animation for prompt {i+idx+1}/{len(sentences)}: {prompt}")
+            output = pipe(
+                prompt=prompt,
+                guidance_scale=1.0,
+                num_inference_steps=step,
+                width=256,
+                height=256,
+            )
+            frames = output.frames[0]
+            if previous_frames is not None:
+                transition = crossfade_transition(previous_frames, frames, transition_frames)
+                all_frames.extend(transition)
+            all_frames.extend(frames)
+            previous_frames = frames
+    # Save video
+    imageio.mimsave(video_path, all_frames, fps=8)
+    print(f"Video saved at {video_path}")
+    return video_path
+def estimate_voiceover_words(video_path):
+    try:
+        # Get video duration in seconds
+        video = VideoFileClip(video_path)
+        duration_minutes = video.duration / 60
+        # Estimate word count based on average speaking rate (150 words per minute)
+        estimated_words = int(duration_minutes * 150)
+        # Ensure a minimum word count
+        return max(estimated_words, 30)
+    except Exception as e:
+        print(f"Error estimating voiceover words: {str(e)}")
+        return 50  # Default fallback
+def summary_of_summary(text, video_path):
+    target_word_count = estimate_voiceover_words(video_path)
+    messages_2 = [
+        {
+            "role": "system",
+            "content": (
+                "You are an expert summarizer focused on brevity and clarity. "
+                f"Create a summary that is exactly around {target_word_count} words: "
+                "1. Capture the most essential information\n"
+                "2. Omit unnecessary details and examples\n"
+                "3. Maintain logical flow and coherence\n"
+                "4. Use clear, direct language"
+            )
+        },
+        {
+            "role": "user",
+            "content": (
+                f"Please summarize the following text in approximately {target_word_count} words:\n\n{text}"
+            )
+        }
+    ]
+    # Generate prompt
+    prompt_for_resummarization = tokenizer.apply_chat_template(
+        messages_2,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    # Generate response
+    response = text_pipe(
+        prompt_for_resummarization,
+        max_new_tokens=target_word_count + 20,
+        num_beams=4,
+        early_stopping=True,
+        no_repeat_ngram_size=3,
+        temperature=0.7,
+        top_p=0.95,
+        do_sample=True
+    )
+    # Extract result
+    summary = response[0]['generated_text'].split("assistant\n")[-1].strip()
+    return summary
+async def generate_audio_with_sentiment(text, sentiment_analyzer):
+    # Perform sentiment analysis on the text
+    sentiment = sentiment_analyzer(text)[0]
+    label = sentiment['label']
+    confidence = sentiment['score']
+    print(f"Sentiment: {label} with confidence {confidence:.2f}")
+    # Set voice parameters based on sentiment
+    if label == "POSITIVE":
+        voice = "en-US-AriaNeural"  # Cheerful and energetic tone for positive sentiment
+        rate = "1.2"  # Faster speech
+        pitch = "+2Hz"  # Slightly higher pitch for a more positive tone
+    else:
+        voice = "en-US-GuyNeural"  # Neutral tone for negative sentiment
+        rate = "0.9"  # Slower speech
+        pitch = "-2Hz"  # Lower pitch for a more somber tone
+    # Generate speech with EdgeTTS
+    communicate = edge_tts.Communicate(text, voice)
+    # Save the audio to a file
+    await communicate.save("output.mp3")
+    # Play the generated audio
+    return "output.mp3"
+def combine_video_with_audio(video_path, audio_path, output_path):
+    # Load video and audio
+    video = VideoFileClip(video_path)
+    audio = AudioFileClip(audio_path)
+    # Set the audio to the video
+    video = video.set_audio(audio)
+    # Save the final video
+    video.write_videofile(output_path, codec='libx264', audio_codec='aac')
+    print("Video with audio saved successfully!")
+# Main processing function
+def create_story_video(prompt, progress=gr.Progress()):
+    # Input validation
+    if not prompt or len(prompt.strip()) < 5:
+        return "Please enter a longer prompt (at least 5 characters).", None, None
+    try:
+        # Step 1: Generate story
+        progress(0, desc="Starting story generation...")
+        story = generate_story(prompt)
+        progress(20, desc="Story generated successfully!")
+        # Step 2: Generate video
+        progress(25, desc="Creating video animation (this may take several minutes)...")
+        video_path = generate_video(story)
+        progress(60, desc="Video created successfully!")
+        # Step 3: Create audio summary
+        progress(65, desc="Creating audio summary...")
+        audio_summary = summary_of_summary(story, video_path)
+        progress(80, desc="Creating audio narration...")
+        # Step 4: Generate audio with sentiment (async)
+        try:
+            # Set up event loop handling
+            try:
+                loop = asyncio.get_event_loop()
+            except RuntimeError:
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+            audio_file = loop.run_until_complete(
+                generate_audio_with_sentiment(audio_summary, sentiment_analyzer)
+            )
+            progress(90, desc="Audio created successfully!")
+        except Exception as e:
+            print(f"Audio generation error: {str(e)}")
+            return story, None, f"Audio generation failed: {str(e)}"
+        # Step 5: Combine video and audio
+        progress(95, desc="Combining video and audio...")
+        output_path = 'final_video_with_audio.mp4'
+        combine_video_with_audio(video_path, audio_file, output_path)
+        progress(100, desc="Process complete!")
+        return story, output_path, audio_summary
+    except Exception as e:
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return f"An error occurred: {str(e)}", None, None
+# Sample prompt examples based on realistic scenarios
+# Each entry is a one-sentence premise; the list feeds the clickable
+# gr.Examples widget in the interface below.
+EXAMPLE_PROMPTS = [
+    "A nurse discovers an unusual pattern in patient symptoms that leads to an important medical breakthrough.",
+    "During a home renovation, a family uncovers a time capsule from the previous owners.",
+    "A struggling local restaurant owner finds an innovative way to save their business during an economic downturn.",
+    "An environmental scientist tracks mysterious wildlife behavior that reveals concerning climate changes.",
+    "A community comes together to rebuild after a devastating natural disaster.",
+    "A teacher develops a unique method that transforms learning for students with special needs.",
+    "An elderly person reconnects with a childhood friend through social media after sixty years apart.",
+    "A food delivery driver forms an unexpected friendship with an isolated elderly customer during the pandemic.",
+    "A first-generation college student overcomes significant obstacles to achieve academic success.",
+    "A wildlife photographer documents the surprising recovery of an endangered species."
+]
+# Create the Gradio interface
+# Layout: prompt box, example prompts, Generate/Clear buttons, a status line,
+# then a "Results" tab (video, story, summary) and a static help tab.
+with gr.Blocks(title="Animind AI Story Video Generator", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎬 AI Story Video Generator")
+    gr.Markdown("Enter a one-sentence prompt to generate a complete story with video and narration.")
+    # Input section
+    with gr.Row():
+        prompt_input = gr.Textbox(
+            label="Your Story Idea",
+            placeholder="Enter a one-sentence prompt (e.g., 'A detective discovers a hidden room in an abandoned mansion')",
+            lines=2
+        )
+    # Example prompts section
+    gr.Markdown("### Try these example prompts:")
+    # Create examples using Gradio's examples feature
+    with gr.Row():
+        # Each example is wrapped in its own list: one row per example,
+        # one column for the single input component.
+        examples = gr.Examples(
+            examples=[[prompt] for prompt in EXAMPLE_PROMPTS],
+            inputs=prompt_input,
+            label="Click any example to load it"
+        )
+    with gr.Row():
+        generate_button = gr.Button("Generate Story Video", variant="primary")
+        clear_button = gr.Button("Clear", variant="secondary")
+    # Status indicator
+    # NOTE(review): nothing in this block updates this text after creation.
+    status_indicator = gr.Markdown("Ready to generate your story video...")
+    # Output section with tabs
+    with gr.Tabs():
+        with gr.TabItem("Results"):
+            with gr.Row():
+                with gr.Column(scale=2):
+                    video_output = gr.Video(label="Generated Video with Narration")
+                with gr.Column(scale=1):
+                    story_output = gr.TextArea(label="Generated Story", lines=15, max_lines=30)
+                    summary_output = gr.TextArea(label="Audio Summary", lines=5)
+        with gr.TabItem("Help & Information"):
+            gr.Markdown("""
+            ## How to use this tool
+            1. Enter a creative one-sentence story idea in the input box
+            2. Click "Generate Story Video" and wait for processing to complete
+            3. View your complete AI-generated story video with narration
+            ## Processing Steps
+            1. **Story Generation**: The AI expands your idea into a 15-20 sentence story
+            2. **Video Creation**: Each sentence is visualized through AI-generated animation
+            3. **Audio Narration**: The AI analyzes the sentiment and creates appropriate voiceover
+            4. **Final Compilation**: Video and audio are combined into your final story
+            ## Tips for Great Results
+            - Use clear, specific prompts that suggest a narrative arc
+            - Include interesting characters, settings, or situations
+            - Make your prompt realistic but with potential for development
+            - Try to suggest a potential conflict or discovery
+            ## Troubleshooting
+            If you encounter errors:
+            - Try a different prompt
+            - Ensure your prompt is clear and specific
+            - Check that all required models are properly loaded
+            """)
+    # Handle clearing
+    def clear_outputs():
+        """Return empty values for the story, video and summary outputs."""
+        return "", None, ""
+    # Connect interface elements
+    # The output order must match the (story, video, summary) tuple that
+    # create_story_video returns.
+    generate_button.click(
+        fn=create_story_video,
+        inputs=prompt_input,
+        outputs=[story_output, video_output, summary_output],
+        api_name="generate"
+    )
+    clear_button.click(
+        fn=clear_outputs,
+        inputs=None,
+        outputs=[story_output, video_output, summary_output]
+    )
+# Launch the app
+# Only start the server when run as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()