rahul7star commited on
Commit
4a08685
·
verified ·
1 Parent(s): 649174b

Update wan2_fast.py

Browse files
Files changed (1) hide show
  1. wan2_fast.py +53 -77
wan2_fast.py CHANGED
@@ -56,6 +56,10 @@ pipe.to("cuda")
56
  # pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
57
  # pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
58
  # pipe.fuse_lora()
 
 
 
 
59
 
60
  # MOD_VALUE = 32
61
  # DEFAULT_H_SLIDER_VALUE = 512
@@ -96,45 +100,56 @@ pipe.to("cuda")
96
 
97
  #New math to make it High Res
98
 
99
- MOD_VALUE = 32
100
 
101
- # Defaults for higher-res generation
102
- DEFAULT_H_SLIDER_VALUE = 768
103
- DEFAULT_W_SLIDER_VALUE = 1344 # near 16:9 (actually 7:4 with 768) and divisible by MOD_VALUE
104
-
105
- # Original Space = Hugging Face space with compute limits
106
- IS_ORIGINAL_SPACE = os.environ.get("IS_ORIGINAL_SPACE", "True") == "True"
107
-
108
- # Conservative limits for low-end environments
109
- LIMITED_MAX_RESOLUTION = 640
110
- LIMITED_MAX_DURATION = 2.0
111
- LIMITED_MAX_STEPS = 4
112
-
113
- # Generous limits for local or Pro spaces
114
- ORIGINAL_SLIDER_MIN_H, ORIGINAL_SLIDER_MAX_H = 128, 1536
115
- ORIGINAL_SLIDER_MIN_W, ORIGINAL_SLIDER_MAX_W = 128, 1536
116
- ORIGINAL_MAX_DURATION = round(81 / 24, 1) # 3.4 seconds
117
- ORIGINAL_MAX_STEPS = 8
118
-
119
- # Use limited or original (generous) settings
120
- if IS_ORIGINAL_SPACE:
121
- SLIDER_MIN_H, SLIDER_MAX_H = 128, LIMITED_MAX_RESOLUTION
122
- SLIDER_MIN_W, SLIDER_MAX_W = 128, LIMITED_MAX_RESOLUTION
123
- MAX_DURATION = LIMITED_MAX_DURATION
124
- MAX_STEPS = LIMITED_MAX_STEPS
125
- else:
126
- SLIDER_MIN_H, SLIDER_MAX_H = ORIGINAL_SLIDER_MIN_H, ORIGINAL_SLIDER_MAX_H
127
- SLIDER_MIN_W, SLIDER_MAX_W = ORIGINAL_SLIDER_MIN_W, ORIGINAL_SLIDER_MAX_W
128
- MAX_DURATION = ORIGINAL_MAX_DURATION
129
- MAX_STEPS = ORIGINAL_MAX_STEPS
130
 
131
- MAX_SEED = np.iinfo(np.int32).max
 
132
 
133
- FIXED_FPS = 24
134
- FIXED_OUTPUT_FPS = 18 # reduce final video FPS to save space
135
- MIN_FRAMES_MODEL = 8
136
- MAX_FRAMES_MODEL = 81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  default_prompt_t2v = "cinematic footage, group of pedestrians dancing in the streets of NYC, high quality breakdance, 4K, tiktok video, intricate details, instagram feel, dynamic camera, smooth dance motion, dimly lit, stylish, beautiful faces, smiling, music video"
140
  default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
@@ -172,45 +187,7 @@ def generate_video(prompt, height, width,
172
  guidance_scale = 1, steps = 4,
173
  seed = 42, randomize_seed = False,
174
  progress=gr.Progress(track_tqdm=True)):
175
- """
176
- Generate a video from a text prompt using the Wan 2.1 T2V model with CausVid LoRA.
177
-
178
- This function takes a text prompt and generates a video based on the provided
179
- prompt and parameters. It uses the Wan 2.1 1.3B Text-to-Video model with CausVid LoRA
180
- for fast generation in 3-8 steps.
181
-
182
- Args:
183
- prompt (str): Text prompt describing the desired video content.
184
- height (int): Target height for the output video. Will be adjusted to multiple of MOD_VALUE (32).
185
- width (int): Target width for the output video. Will be adjusted to multiple of MOD_VALUE (32).
186
- negative_prompt (str, optional): Negative prompt to avoid unwanted elements.
187
- Defaults to default_negative_prompt (contains unwanted visual artifacts).
188
- duration_seconds (float, optional): Duration of the generated video in seconds.
189
- Defaults to 2. Clamped between MIN_FRAMES_MODEL/FIXED_FPS and MAX_FRAMES_MODEL/FIXED_FPS.
190
- guidance_scale (float, optional): Controls adherence to the prompt. Higher values = more adherence.
191
- Defaults to 1.0. Range: 0.0-20.0.
192
- steps (int, optional): Number of inference steps. More steps = higher quality but slower.
193
- Defaults to 4. Range: 1-30.
194
- seed (int, optional): Random seed for reproducible results. Defaults to 42.
195
- Range: 0 to MAX_SEED (2147483647).
196
- randomize_seed (bool, optional): Whether to use a random seed instead of the provided seed.
197
- Defaults to False.
198
- progress (gr.Progress, optional): Gradio progress tracker. Defaults to gr.Progress(track_tqdm=True).
199
-
200
- Returns:
201
- tuple: A tuple containing:
202
- - video_path (str): Path to the generated video file (.mp4)
203
- - current_seed (int): The seed used for generation (useful when randomize_seed=True)
204
-
205
- Raises:
206
- gr.Error: If prompt is empty or None.
207
-
208
- Note:
209
- - Frame count is calculated as duration_seconds * FIXED_FPS (24)
210
- - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
211
- - The function uses GPU acceleration via the @spaces.GPU decorator
212
- - Generation time varies based on steps and duration (see get_duration function)
213
- """
214
  if not prompt or prompt.strip() == "":
215
  raise gr.Error("Please enter a text prompt. Try to use long and precise descriptions.")
216
 
@@ -246,9 +223,8 @@ def generate_video(prompt, height, width,
246
  with gr.Blocks(css="body { max-width: 100vw; overflow-x: hidden; }") as demo:
247
  gr.HTML('<meta name="viewport" content="width=device-width, initial-scale=1">')
248
  # ... your other components here ...
249
- gr.Markdown("# ⚡ InstaVideo")
250
- gr.Markdown("This Gradio space is a fork of [wan2-1-fast from multimodalart](https://huggingface.co/spaces/multimodalart/wan2-1-fast), and is powered by the Wan CausVid LoRA [from Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_bidirect2_T2V_1_3B_lora_rank32.safetensors).")
251
-
252
  # Add notice for limited spaces
253
  if IS_ORIGINAL_SPACE:
254
  gr.Markdown("⚠️ **This free public demo limits the resolution to 640px, duration to 2s, and inference steps to 4. For full capabilities please duplicate this space.**")
 
56
  # pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
57
  # pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
58
  # pipe.fuse_lora()
59
+ #####################################################
60
+
61
+
62
+
63
 
64
  # MOD_VALUE = 32
65
  # DEFAULT_H_SLIDER_VALUE = 512
 
100
 
101
  #New math to make it High Res
102
 
103
+ # MOD_VALUE = 32
104
 
105
+ # # Defaults for higher-res generation
106
+ # DEFAULT_H_SLIDER_VALUE = 768
107
+ # DEFAULT_W_SLIDER_VALUE = 1344 # near 16:9 (actually 7:4 with 768) and divisible by MOD_VALUE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ # # Original Space = Hugging Face space with compute limits
110
+ # IS_ORIGINAL_SPACE = os.environ.get("IS_ORIGINAL_SPACE", "True") == "True"
111
 
112
+ # # Conservative limits for low-end environments
113
+ # LIMITED_MAX_RESOLUTION = 640
114
+ # LIMITED_MAX_DURATION = 2.0
115
+ # LIMITED_MAX_STEPS = 4
116
+
117
+ # # Generous limits for local or Pro spaces
118
+ # ORIGINAL_SLIDER_MIN_H, ORIGINAL_SLIDER_MAX_H = 128, 1536
119
+ # ORIGINAL_SLIDER_MIN_W, ORIGINAL_SLIDER_MAX_W = 128, 1536
120
+ # ORIGINAL_MAX_DURATION = round(81 / 24, 1) # 3.4 seconds
121
+ # ORIGINAL_MAX_STEPS = 8
122
+
123
+ # # Use limited or original (generous) settings
124
+ # if IS_ORIGINAL_SPACE:
125
+ # SLIDER_MIN_H, SLIDER_MAX_H = 128, LIMITED_MAX_RESOLUTION
126
+ # SLIDER_MIN_W, SLIDER_MAX_W = 128, LIMITED_MAX_RESOLUTION
127
+ # MAX_DURATION = LIMITED_MAX_DURATION
128
+ # MAX_STEPS = LIMITED_MAX_STEPS
129
+ # else:
130
+ # SLIDER_MIN_H, SLIDER_MAX_H = ORIGINAL_SLIDER_MIN_H, ORIGINAL_SLIDER_MAX_H
131
+ # SLIDER_MIN_W, SLIDER_MAX_W = ORIGINAL_SLIDER_MIN_W, ORIGINAL_SLIDER_MAX_W
132
+ # MAX_DURATION = ORIGINAL_MAX_DURATION
133
+ # MAX_STEPS = ORIGINAL_MAX_STEPS
134
+
135
+ # MAX_SEED = np.iinfo(np.int32).max
136
+
137
+ # FIXED_FPS = 24
138
+ # FIXED_OUTPUT_FPS = 18 # reduce final video FPS to save space
139
+ # MIN_FRAMES_MODEL = 8
140
+ # MAX_FRAMES_MODEL = 81
141
 
142
+ # Constants
143
+ MOD_VALUE = 32
144
+ DEFAULT_H_SLIDER_VALUE = 896
145
+ DEFAULT_W_SLIDER_VALUE = 896
146
+ NEW_FORMULA_MAX_AREA = 720 * 1024
147
+ SLIDER_MIN_H, SLIDER_MAX_H = 256, 1024
148
+ SLIDER_MIN_W, SLIDER_MAX_W = 256, 1024
149
+ MAX_SEED = np.iinfo(np.int32).max
150
+ FIXED_FPS = 24
151
+ MIN_FRAMES_MODEL = 25
152
+ MAX_FRAMES_MODEL = 193
153
 
154
  default_prompt_t2v = "cinematic footage, group of pedestrians dancing in the streets of NYC, high quality breakdance, 4K, tiktok video, intricate details, instagram feel, dynamic camera, smooth dance motion, dimly lit, stylish, beautiful faces, smiling, music video"
155
  default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
 
187
  guidance_scale = 1, steps = 4,
188
  seed = 42, randomize_seed = False,
189
  progress=gr.Progress(track_tqdm=True)):
190
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  if not prompt or prompt.strip() == "":
192
  raise gr.Error("Please enter a text prompt. Try to use long and precise descriptions.")
193
 
 
223
  with gr.Blocks(css="body { max-width: 100vw; overflow-x: hidden; }") as demo:
224
  gr.HTML('<meta name="viewport" content="width=device-width, initial-scale=1">')
225
  # ... your other components here ...
226
+ gr.Markdown("# ⚡ InstaVideo - FastWan2.2 Demo")
227
+
 
228
  # Add notice for limited spaces
229
  if IS_ORIGINAL_SPACE:
230
  gr.Markdown("⚠️ **This free public demo limits the resolution to 640px, duration to 2s, and inference steps to 4. For full capabilities please duplicate this space.**")