Spaces:

openfree
/

ginigen-sora

Running

App Files Files Community

openfree committed on Nov 23, 2024

Commit

cb2582c

verified ·

1 Parent(s): a040b19

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -82

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from gradio_toggle import Toggle
 import torch
 from huggingface_hub import snapshot_download
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
@@ -20,11 +21,33 @@ import tempfile
 import os
 import gc
 from openai import OpenAI
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
 openai_api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=openai_api_key)
 system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
 system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
 with open(system_prompt_t2v_path, "r") as f:
@@ -47,7 +70,6 @@ scheduler_dir = Path(model_path) / "scheduler"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
@@ -58,7 +80,6 @@ def load_vae(vae_dir):
     vae.load_state_dict(vae_state_dict)
     return vae.to(device=device, dtype=torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
@@ -68,13 +89,11 @@ def load_unet(unet_dir):
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.to(device=device, dtype=torch.bfloat16)
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 # Helper function for image processing
 def center_crop_and_resize(frame, target_height, target_width):
     h, w, _ = frame.shape
@@ -91,7 +110,6 @@ def center_crop_and_resize(frame, target_height, target_width):
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
 def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
     image = Image.open(image_path).convert("RGB")
     image_np = np.array(image)
@@ -100,7 +118,6 @@ def load_image_to_tensor_with_resize(image_path, target_height=512, target_width
     frame_tensor = (frame_tensor / 127.5) - 1.0
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
     if not enhance_toggle:
         print("Enhance toggle is off, Prompt: ", prompt)
@@ -114,7 +131,7 @@ def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
     try:
         response = client.chat.completions.create(
-            model="gpt-4o-mini",
             messages=messages,
             max_tokens=200,
         )
@@ -124,7 +141,6 @@ def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
         print(f"Error: {e}")
         return prompt
 # Preset options for resolution and frame configuration
 preset_options = [
     {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
@@ -156,8 +172,6 @@ preset_options = [
     {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
 ]
-# Function to toggle visibility of sliders based on preset selection
 def preset_changed(preset):
     if preset != "Custom":
         selected = next(item for item in preset_options if item["label"] == preset)
@@ -179,7 +193,6 @@ def preset_changed(preset):
             gr.update(visible=True),
         )
 # Load models
 vae = load_vae(vae_dir)
 unet = load_unet(unet_dir)
@@ -201,7 +214,6 @@ pipeline = XoraVideoPipeline(
     vae=vae,
 ).to(device)
 def generate_video_from_text(
     prompt="",
     enhance_prompt_toggle=False,
@@ -217,11 +229,16 @@ def generate_video_from_text(
 ):
     if len(prompt.strip()) < 50:
         raise gr.Error(
-            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
             duration=5,
         )
-    prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
     sample = {
         "prompt": prompt,
@@ -257,7 +274,7 @@ def generate_video_from_text(
             ).images
     except Exception as e:
         raise gr.Error(
-            f"An error occurred while generating the video. Please try again. Error: {e}",
             duration=5,
         )
     finally:
@@ -275,13 +292,13 @@ def generate_video_from_text(
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
-    # Explicitly delete tensors and clear cache
     del images
     del video_np
     torch.cuda.empty_cache()
     return output_path
 def generate_video_from_image(
     image_path,
     prompt="",
@@ -296,25 +313,29 @@ def generate_video_from_image(
     num_frames=121,
     progress=gr.Progress(),
 ):
     print("Height: ", height)
     print("Width: ", width)
     print("Num Frames: ", num_frames)
     if len(prompt.strip()) < 50:
         raise gr.Error(
-            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
             duration=5,
         )
     if not image_path:
-        raise gr.Error("Please provide an input image.", duration=5)
     media_items = (
         load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
     )
-    prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
     sample = {
         "prompt": prompt,
@@ -361,7 +382,7 @@ def generate_video_from_image(
         out.release()
     except Exception as e:
         raise gr.Error(
-            f"An error occurred while generating the video. Please try again. Error: {e}",
             duration=5,
         )
@@ -371,7 +392,6 @@ def generate_video_from_image(
     return output_path
 def create_advanced_options():
     with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
         seed = gr.Slider(
@@ -418,8 +438,7 @@ def create_advanced_options():
             num_frames_slider,
         ]
-# Define the Gradio interface with tabs
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     with gr.Row(elem_id="title-row"):
         gr.Markdown(
@@ -430,7 +449,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         """
         )
     with gr.Row(elem_id="title-row"):
-        gr.HTML(  # add technical report link
             """
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/Lightricks/LTX-Video">
@@ -456,62 +475,63 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     ):
         gr.Markdown(
             """
-        📝 Prompt Engineering
-        When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.
-        For best results, build your prompts using this structure:
-        - Start with main action in a single sentence
-        - Add specific details about movements and gestures
-        - Describe character/object appearances precisely
-        - Include background and environment details
-        - Specify camera angles and movements
-        - Describe lighting and colors
-        - Note any changes or sudden events
-        See examples for more inspiration.
-        🎮 Parameter Guide
-        - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
-        - Seed: Save seed values to recreate specific styles or compositions you like
-        - Guidance Scale: 3-3.5 are the recommended values
-        - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
         """
         )
     with gr.Tabs():
         # Text to Video Tab
-        with gr.TabItem("Text to Video"):
             with gr.Row():
                 with gr.Column():
                     txt2vid_prompt = gr.Textbox(
-                        label="Step 1: Enter Your Prompt",
-                        placeholder="Describe the video you want to generate (minimum 50 characters)...",
-                        value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
                     txt2vid_enhance_toggle = Toggle(
-                        label="Enhance Prompt",
                         value=False,
                         interactive=True,
                     )
                     txt2vid_negative_prompt = gr.Textbox(
-                        label="Step 2: Enter Negative Prompt",
-                        placeholder="Describe what you don't want in the video...",
-                        value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                         lines=2,
                     )
                     txt2vid_preset = gr.Dropdown(
                         choices=[p["label"] for p in preset_options],
                         value="768x512, 97 frames",
-                        label="Step 3.1: Choose Resolution Preset",
                     )
                     txt2vid_frame_rate = gr.Slider(
-                        label="Step 3.2: Frame Rate",
                         minimum=21,
                         maximum=30,
                         step=1,
@@ -520,72 +540,72 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                     txt2vid_advanced = create_advanced_options()
                     txt2vid_generate = gr.Button(
-                        "Step 5: Generate Video",
                         variant="primary",
                         size="lg",
                     )
                 with gr.Column():
-                    txt2vid_output = gr.Video(label="Generated Output")
             with gr.Row():
                 gr.Examples(
                     examples=[
                         [
-                            "A young woman in a traditional Mongolian dress is peeking through a sheer white curtain, her face showing a mix of curiosity and apprehension. The woman has long black hair styled in two braids, adorned with white beads, and her eyes are wide with a hint of surprise. Her dress is a vibrant blue with intricate gold embroidery, and she wears a matching headband with a similar design. The background is a simple white curtain, which creates a sense of mystery and intrigue.ith long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair’s face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/t2v_2.mp4",
                         ],
                         [
-                            "A young man with blond hair wearing a yellow jacket stands in a forest and looks around. He has light skin and his hair is styled with a middle part. He looks to the left and then to the right, his gaze lingering in each direction. The camera angle is low, looking up at the man, and remains stationary throughout the video. The background is slightly out of focus, with green trees and the sun shining brightly behind the man. The lighting is natural and warm, with the sun creating a lens flare that moves across the man’s face. The scene is captured in real-life footage.",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/t2v_1.mp4",
                         ],
                         [
-                            "A cyclist races along a winding mountain road. Clad in aerodynamic gear, he pedals intensely, sweat glistening on his brow. The camera alternates between close-ups of his determined expression and wide shots of the breathtaking landscape. Pine trees blur past, and the sky is a crisp blue. The scene is invigorating and competitive.",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/t2v_0.mp4",
                         ],
                     ],
                     inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
-                    label="Example Text-to-Video Generations",
                 )
         # Image to Video Tab
-        with gr.TabItem("Image to Video"):
             with gr.Row():
                 with gr.Column():
                     img2vid_image = gr.Image(
                         type="filepath",
-                        label="Step 1: Upload Input Image",
                         elem_id="image_upload",
                     )
                     img2vid_prompt = gr.Textbox(
-                        label="Step 2: Enter Your Prompt",
-                        placeholder="Describe how you want to animate the image (minimum 50 characters)...",
-                        value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                         lines=5,
                     )
                     img2vid_enhance_toggle = Toggle(
-                        label="Enhance Prompt",
                         value=False,
                         interactive=True,
                     )
                     img2vid_negative_prompt = gr.Textbox(
-                        label="Step 3: Enter Negative Prompt",
-                        placeholder="Describe what you don't want in the video...",
-                        value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                         lines=2,
                     )
                     img2vid_preset = gr.Dropdown(
                         choices=[p["label"] for p in preset_options],
                         value="768x512, 97 frames",
-                        label="Step 3.1: Choose Resolution Preset",
                     )
                     img2vid_frame_rate = gr.Slider(
-                        label="Step 3.2: Frame Rate",
                         minimum=21,
                         maximum=30,
                         step=1,
@@ -594,31 +614,31 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                     img2vid_advanced = create_advanced_options()
                     img2vid_generate = gr.Button(
-                        "Step 6: Generate Video", variant="primary", size="lg"
                     )
                 with gr.Column():
-                    img2vid_output = gr.Video(label="Generated Output")
             with gr.Row():
                 gr.Examples(
                     examples=[
                         [
                             "assets/i2v_i2.png",
-                            "A woman stirs a pot of boiling water on a white electric burner. Her hands, with purple nail polish, hold a wooden spoon and move it in a circular motion within a white pot filled with bubbling water. The pot sits on a white electric burner with black buttons and a digital display. The burner is positioned on a white countertop with a red and white checkered cloth partially visible in the bottom right corner. The camera angle is a direct overhead shot, remaining stationary throughout the scene. The lighting is bright and even, illuminating the scene with a neutral white light. The scene is real-life footage.",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/i2v_2.mp4",
                         ],
                         [
                             "assets/i2v_i0.png",
-                            "A woman in a long, flowing dress stands in a field, her back to the camera, gazing towards the horizon; her hair is long and light, cascading down her back; she stands beneath the sprawling branches of a large oak tree;  to her left, a classic American car is parked on the dry grass; in the distance, a wrecked car lies on its side; the sky above is a dramatic canvas of bright white clouds against a darker sky; the entire image is in black and white, emphasizing the contrast of light and shadow. The woman is walking slowly towards the car.",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/i2v_0.mp4",
                         ],
                         [
                             "assets/i2v_i1.png",
-                            "A pair of hands shapes a piece of clay on a pottery wheel, gradually forming a cone shape. The hands, belonging to a person out of frame, are covered in clay and gently press a ball of clay onto the center of a spinning pottery wheel. The hands move in a circular motion, gradually forming a cone shape at the top of the clay. The camera is positioned directly above the pottery wheel, providing a bird’s-eye view of the clay being shaped. The lighting is bright and even, illuminating the clay and the hands working on it. The scene is captured in real-life footage.",
-                            "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
                             "assets/i2v_1.mp4",
                         ],
                     ],
@@ -628,10 +648,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                         img2vid_negative_prompt,
                         img2vid_output,
                     ],
-                    label="Example Image-to-Video Generations",
                 )
-    # [Previous event handlers remain the same]
     txt2vid_preset.change(
         fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
     )
@@ -674,4 +694,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
 if __name__ == "__main__":
     iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
         share=True, show_api=False
-    )

 from gradio_toggle import Toggle
 import torch
 from huggingface_hub import snapshot_download
+from transformers import pipeline
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
 import os
 import gc
 from openai import OpenAI
+import re
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
 openai_api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=openai_api_key)
+# Initialize translation pipeline
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+# Korean text detection function
+def contains_korean(text):
+    korean_pattern = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣]')
+    return bool(korean_pattern.search(text))
+def translate_korean_prompt(prompt):
+    """
+    Translate Korean prompt to English if Korean text is detected
+    """
+    if contains_korean(prompt):
+        translated = translator(prompt)[0]['translation_text']
+        print(f"Original Korean prompt: {prompt}")
+        print(f"Translated English prompt: {translated}")
+        return translated
+    return prompt
+# Load system prompts
 system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
 system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
 with open(system_prompt_t2v_path, "r") as f:
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
     vae_config_path = vae_dir / "config.json"
     vae.load_state_dict(vae_state_dict)
     return vae.to(device=device, dtype=torch.bfloat16)
 def load_unet(unet_dir):
     unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
     unet_config_path = unet_dir / "config.json"
     transformer.load_state_dict(unet_state_dict, strict=True)
     return transformer.to(device=device, dtype=torch.bfloat16)
 def load_scheduler(scheduler_dir):
     scheduler_config_path = scheduler_dir / "scheduler_config.json"
     scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
     return RectifiedFlowScheduler.from_config(scheduler_config)
 # Helper function for image processing
 def center_crop_and_resize(frame, target_height, target_width):
     h, w, _ = frame.shape
     frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
     return frame_resized
 def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
     image = Image.open(image_path).convert("RGB")
     image_np = np.array(image)
     frame_tensor = (frame_tensor / 127.5) - 1.0
     return frame_tensor.unsqueeze(0).unsqueeze(2)
 def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
     if not enhance_toggle:
         print("Enhance toggle is off, Prompt: ", prompt)
     try:
         response = client.chat.completions.create(
+            model="gpt-4-1106-preview",
             messages=messages,
             max_tokens=200,
         )
         print(f"Error: {e}")
         return prompt
 # Preset options for resolution and frame configuration
 preset_options = [
     {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
     {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
 ]
 def preset_changed(preset):
     if preset != "Custom":
         selected = next(item for item in preset_options if item["label"] == preset)
             gr.update(visible=True),
         )
 # Load models
 vae = load_vae(vae_dir)
 unet = load_unet(unet_dir)
     vae=vae,
 ).to(device)
 def generate_video_from_text(
     prompt="",
     enhance_prompt_toggle=False,
 ):
     if len(prompt.strip()) < 50:
         raise gr.Error(
+            "프롬프트는 최소 50자 이상이어야 합니다. 더 자세한 설명을 제공해주세요.",
             duration=5,
         )
+    # Translate Korean prompts to English
+    prompt = translate_korean_prompt(prompt)
+    negative_prompt = translate_korean_prompt(negative_prompt)
+    if enhance_prompt_toggle:
+        prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
     sample = {
         "prompt": prompt,
             ).images
     except Exception as e:
         raise gr.Error(
+            f"비디오 생성 중 오류가 발생했습니다. 다시 시도해주세요. 오류: {e}",
             duration=5,
         )
     finally:
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
     del images
     del video_np
     torch.cuda.empty_cache()
     return output_path
 def generate_video_from_image(
     image_path,
     prompt="",
     num_frames=121,
     progress=gr.Progress(),
 ):
     print("Height: ", height)
     print("Width: ", width)
     print("Num Frames: ", num_frames)
     if len(prompt.strip()) < 50:
         raise gr.Error(
+            "프롬프트는 최소 50자 이상이어야 합니다. 더 자세한 설명을 제공해주세요.",
             duration=5,
         )
     if not image_path:
+        raise gr.Error("입력 이미지를 제공해주세요.", duration=5)
+    # Translate Korean prompts to English
+    prompt = translate_korean_prompt(prompt)
+    negative_prompt = translate_korean_prompt(negative_prompt)
     media_items = (
         load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
     )
+    if enhance_prompt_toggle:
+        prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
     sample = {
         "prompt": prompt,
         out.release()
     except Exception as e:
         raise gr.Error(
+            f"비디오 생성 중 오류가 발생했습니다. 다시 시도해주세요. 오류: {e}",
             duration=5,
         )
     return output_path
 def create_advanced_options():
     with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
         seed = gr.Slider(
             num_frames_slider,
         ]
+# Gradio Interface Definition
 with gr.Blocks(theme=gr.themes.Soft()) as iface:
     with gr.Row(elem_id="title-row"):
         gr.Markdown(
         """
         )
     with gr.Row(elem_id="title-row"):
+        gr.HTML(
             """
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/Lightricks/LTX-Video">
     ):
         gr.Markdown(
             """
+        📝 프롬프트 작성 팁
+        프롬프트 작성 시 동작과 장면에 대한 상세하고 시간 순서대로 된 설명에 집중하세요. 구체적인 움직임, 외모, 카메라 각도, 환경 세부 사항을 포함하되 하나의 문단으로 자연스럽게 작성하세요. 동작으로 바로 시작하고, 설명은 문자 그대로 정확하게 해주세요. 촬영 감독이 촬영 목록을 설명하는 것처럼 생각하세요. 200단어 이내로 작성하세요.
+        프롬프트는 다음 구조로 작성하면 좋습니다:
+        - 주요 동작을 한 문장으로 시작
+        - 구체적인 동작과 제스처 추가
+        - 캐릭터/객체의 외모를 정확히 설명
+        - 배경과 환경 세부 사항 포함
+        - 카메라 각도와 움직임 지정
+        - 조명과 색상 설명
+        - 변화나 갑작스러운 사건 기록
+        예시를 참고하세요.
+        🎮 매개변수 가이드
+        - 해상도 프리셋: 상세한 장면은 높은 해상도, 단순한 장면은 낮은 해상도 선택
+        - Seed: 특정 스타일이나 구성을 재현하고 싶을 때 seed 값 저장
+        - Guidance Scale: 3-3.5가 권장값
+        - Inference Steps: 품질을 위해서는 40+ 단계, 속도를 위해서는 20-30 단계
         """
         )
     with gr.Tabs():
         # Text to Video Tab
+        with gr.TabItem("텍스트로 비디오 만들기"):
             with gr.Row():
                 with gr.Column():
                     txt2vid_prompt = gr.Textbox(
+                        label="Step 1: 프롬프트 입력",
+                        placeholder="생성하고 싶은 비디오를 설명하세요 (최소 50자)...",
+                        value="갈색 긴 머리를 가진 여성이 금발의 긴 머리를 가진 다른 여성을 향해 미소짓습니다. 갈색 머리의 여성은 검은색 자켓을 입고 있으며 오른쪽 뺨에 작은 점이 있습니다. 카메라 각도는 갈색 머리 여성의 얼굴에 클로즈업되어 있습니다. 조명은 자연스럽고 따뜻하며, 석양에서 오는 듯한 부드러운 빛이 장면을 비춥니다. 장면은 실제 영상처럼 보입니다.",
                         lines=5,
                     )
                     txt2vid_enhance_toggle = Toggle(
+                        label="프롬프트 개선",
                         value=False,
                         interactive=True,
                     )
                     txt2vid_negative_prompt = gr.Textbox(
+                        label="Step 2: 네거티브 프롬프트 입력",
+                        placeholder="비디오에서 원하지 않는 요소를 설명하세요...",
+                        value="낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                         lines=2,
                     )
                     txt2vid_preset = gr.Dropdown(
                         choices=[p["label"] for p in preset_options],
                         value="768x512, 97 frames",
+                        label="Step 3.1: 해상도 프리셋 선택",
                     )
                     txt2vid_frame_rate = gr.Slider(
+                        label="Step 3.2: 프레임 레이트",
                         minimum=21,
                         maximum=30,
                         step=1,
                     txt2vid_advanced = create_advanced_options()
                     txt2vid_generate = gr.Button(
+                        "Step 5: 비디오 생성",
                         variant="primary",
                         size="lg",
                     )
                 with gr.Column():
+                    txt2vid_output = gr.Video(label="생성된 비디오")
             with gr.Row():
                 gr.Examples(
                     examples=[
                         [
+                            "전통적인 몽골 드레스를 입은 젊은 여성이 얇은 흰색 커튼을 통해 호기심과 긴장이 섞인 표정으로 들여다보고 있습니다. 여성은 흰 구슬로 장식된 두 개의 땋은 머리로 스타일링된 긴 검은 머리를 하고 있으며, 눈은 놀람을 띄며 크게 떠져 있습니다. 그녀의 드레스는 화려한 금색 자수가 새겨진 선명한 파란색이며, 비슷한 디자인의 머리띠를 하고 있습니다. 배경은 신비로움과 호기심을 자아내는 단순한 흰색 커튼입니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/t2v_2.mp4",
                         ],
                         [
+                            "노란색 재킷을 입은 금발 머리의 젊은 남자가 숲에 서서 주위를 둘러봅니다. 그는 밝은 피부를 가졌고 머리는 가운데 가르마로 스타일링되어 있습니다. 그는 왼쪽을 보고 난 후 오른쪽을 보며, 각 방향을 잠시 응시합니다. 카메라는 낮은 각도에서 남자를 올려다보며 고정되어 있습니다. 배경은 약간 흐릿하며, 녹색 나무들과 남자의 뒤에서 밝게 비치는 태양이 보입니다. 조명은 자연스럽고 따뜻하며, 태양 빛이 남자의 얼굴을 가로지르는 렌즈 플레어를 만듭니다. 장면은 실제 영상처럼 촬영되었습니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/t2v_1.mp4",
                         ],
                         [
+                            "한 사이클리스트가 굽이진 산길을 따라 달립니다. 공기역학적인 장비를 입은 그는 강하게 페달을 밟고 있으며, 이마에는 땀방울이 반짝입니다. 카메라는 그의 결연한 표정과 숨 막히는 풍경을 번갈아가며 보여줍니다. 소나무들이 스쳐 지나가고, 하늘은 선명한 파란색입니다. 이 장면은 활기차고 경쟁적인 분위기를 자아냅니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/t2v_0.mp4",
                         ],
                     ],
                     inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
+                    label="텍스트-비디오 생성 예시",
                 )
         # Image to Video Tab
+        with gr.TabItem("이미지로 비디오 만들기"):
             with gr.Row():
                 with gr.Column():
                     img2vid_image = gr.Image(
                         type="filepath",
+                        label="Step 1: 입력 이미지 업로드",
                         elem_id="image_upload",
                     )
                     img2vid_prompt = gr.Textbox(
+                        label="Step 2: 프롬프트 입력",
+                        placeholder="이미지를 어떻게 애니메이션화할지 설명하세요 (최소 50자)...",
+                        value="갈색 긴 머리를 가진 여성이 금발의 긴 머리를 가진 다른 여성을 향해 미소짓습니다. 갈색 머리의 여성은 검은색 자켓을 입고 있으며 오른쪽 뺨에 작은 점이 있습니다. 카메라 각도는 갈색 머리 여성의 얼굴에 클로즈업되어 있습니다. 조명은 자연스럽고 따뜻하며, 석양에서 오는 듯한 부드러운 빛이 장면을 비춥니다. 장면은 실제 영상처럼 보입니다.",
                         lines=5,
                     )
                     img2vid_enhance_toggle = Toggle(
+                        label="프롬프트 개선",
                         value=False,
                         interactive=True,
                     )
                     img2vid_negative_prompt = gr.Textbox(
+                        label="Step 3: 네거티브 프롬프트 입력",
+                        placeholder="비디오에서 원하지 않는 요소를 설명하세요...",
+                        value="낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                         lines=2,
                     )
                     img2vid_preset = gr.Dropdown(
                         choices=[p["label"] for p in preset_options],
                         value="768x512, 97 frames",
+                        label="Step 3.1: 해상도 프리셋 선택",
                     )
                     img2vid_frame_rate = gr.Slider(
+                        label="Step 3.2: 프레임 레이트",
                         minimum=21,
                         maximum=30,
                         step=1,
                     img2vid_advanced = create_advanced_options()
                     img2vid_generate = gr.Button(
+                        "Step 6: 비디오 생성", variant="primary", size="lg"
                     )
                 with gr.Column():
+                    img2vid_output = gr.Video(label="생성된 비디오")
             with gr.Row():
                 gr.Examples(
                     examples=[
                         [
                             "assets/i2v_i2.png",
+                            "여성이 흰색 전기 버너 위에서 끓는 물이 담긴 냄비를 젓고 있습니다. 보라색 매니큐어를 바른 그녀의 손이 하얀 냄비 안에서 나무 숟가락을 원형으로 움직입니다. 냄비는 검은색 버튼과 디지털 디스플레이가 있는 흰색 전기 버너 위에 놓여 있습니다. 버너는 오른쪽 아래 모서리에 빨간색과 흰색 체크무늬 천이 부분적으로 보이는 흰색 조리대 위에 놓여 있습니다. 카메라 각도는 정확히 위에서 내려다보는 각도이며 장면 내내 고정되어 있습니다. 조명은 밝고 고른 중성적인 흰색 빛으로 장면을 비춥니다. 장면은 실제 영상처럼 보입니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/i2v_2.mp4",
                         ],
                         [
                             "assets/i2v_i0.png",
+                            "긴 흐르는 드레스를 입은 여성이 들판에 서서 등을 카메라를 향한 채 지평선을 바라보고 있습니다. 그녀의 머리카락은 길고 밝으며 등 아래로 흘러내립니다. 그녀는 큰 참나무의 넓게 퍼진 가지 아래에 서 있습니다. 왼쪽으로는 말라붙은 잔디 위에 클래식한 미국 자동차가 주차되어 있습니다. 멀리서는 한 대의 부서진 자동차가 옆으로 누워 있습니다. 위의 하늘은 어두운 하늘을 배경으로 밝은 흰 구름이 극적인 캔버스를 이루고 있습니다. 전체 이미지는 흑백으로, 빛과 그림자의 대비를 강조합니다. 여성이 천천히 자동차를 향해 걸어가고 있습니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/i2v_0.mp4",
                         ],
                         [
                             "assets/i2v_i1.png",
+                            "한 쌍의 손이 도자기 물레 위에서 점토 조각을 모양 잡아 점차적으로 원뿔 모양을 만들어가고 있습니다. 프레임 밖의 사람의 손이 점토로 덮여 있으며, 회전하는 도자기 물레 중앙에 점토 덩어리를 부드럽게 누르고 있습니다. 손은 원형으로 움직이며, 점토 위쪽에 점차적으로 원뿔 모양을 만들어갑니다. 카메라는 도자기 물레 바로 위에 위치하여 점토가 모양 잡혀가는 것을 조감도로 보여줍니다. 조명은 밝고 고르며, 점토와 그것을 다루는 손을 밝게 비춥니다. 장면은 실제 영상처럼 촬영되었습니다.",
+                            "낮은 품질, 최악의 품질, 기형, 왜곡된, 일그러진, 모션 스미어, 모션 아티팩트, 융합된 손가락, 잘못된 해부학, 이상한 손, 추한",
                             "assets/i2v_1.mp4",
                         ],
                     ],
                         img2vid_negative_prompt,
                         img2vid_output,
                     ],
+                    label="이미지-비디오 생성 예시",
                 )
+    # Event handlers
     txt2vid_preset.change(
         fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
     )
 if __name__ == "__main__":
     iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
         share=True, show_api=False
+    )