This Pull Request also adds support for extending an existing video, and optimizes generation time & VRAM usage

#1
.gitattributes CHANGED
@@ -36,3 +36,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  img_examples/1.png filter=lfs diff=lfs merge=lfs -text
37
  img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
38
  img_examples/3.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
36
  img_examples/1.png filter=lfs diff=lfs merge=lfs -text
37
  img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
38
  img_examples/3.png filter=lfs diff=lfs merge=lfs -text
39
+ img_examples/Example1.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ img_examples/Example1.png filter=lfs diff=lfs merge=lfs -text
41
+ img_examples/Example2.webp filter=lfs diff=lfs merge=lfs -text
42
+ img_examples/Example3.jpg filter=lfs diff=lfs merge=lfs -text
43
+ img_examples/Example4.webp filter=lfs diff=lfs merge=lfs -text
44
+ img_examples/Example5.png filter=lfs diff=lfs merge=lfs -text
45
+ img_examples/Example6.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -4,11 +4,19 @@ emoji: 📹⚡️
4
  colorFrom: pink
5
  colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.32.0
8
- app_file: app.py
9
  pinned: true
 
 
10
  license: apache-2.0
11
- short_description: fast video generation from images & text
 
 
 
 
 
 
 
 
12
  ---
13
  paper: arxiv:2504.12626
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: pink
5
  colorTo: gray
6
  sdk: gradio
 
 
7
  pinned: true
8
+ sdk_version: 5.29.1
9
+ app_file: app.py
10
  license: apache-2.0
11
+ short_description: Text-to-Video/Image-to-Video/Video extender (timed prompt)
12
+ tags:
13
+ - Image-to-Video
14
+ - Image-2-Video
15
+ - Img-to-Vid
16
+ - Img-2-Vid
17
+ - language models
18
+ - LLMs
19
+ suggested_hardware: zero-a10g
20
  ---
21
  paper: arxiv:2504.12626
22
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
app_lora.py ADDED
The diff for this file is too large to render. See raw diff
 
diffusers_helper/bucket_tools.py CHANGED
@@ -15,6 +15,79 @@ bucket_options = {
15
  (864, 448),
16
  (960, 416),
17
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  }
19
 
20
 
@@ -26,5 +99,5 @@ def find_nearest_bucket(h, w, resolution=640):
26
  if metric <= min_metric:
27
  min_metric = metric
28
  best_bucket = (bucket_h, bucket_w)
 
29
  return best_bucket
30
-
 
15
  (864, 448),
16
  (960, 416),
17
  ],
18
+ 672: [
19
+ (480, 864),
20
+ (512, 832),
21
+ (544, 768),
22
+ (576, 704),
23
+ (608, 672),
24
+ (640, 640),
25
+ (672, 608),
26
+ (704, 576),
27
+ (768, 544),
28
+ (832, 512),
29
+ (864, 480),
30
+ ],
31
+ 704: [
32
+ (480, 960),
33
+ (512, 864),
34
+ (544, 832),
35
+ (576, 768),
36
+ (608, 704),
37
+ (640, 672),
38
+ (672, 640),
39
+ (704, 608),
40
+ (768, 576),
41
+ (832, 544),
42
+ (864, 512),
43
+ (960, 480),
44
+ ],
45
+ 768: [
46
+ (512, 960),
47
+ (544, 864),
48
+ (576, 832),
49
+ (608, 768),
50
+ (640, 704),
51
+ (672, 672),
52
+ (704, 640),
53
+ (768, 608),
54
+ (832, 576),
55
+ (864, 544),
56
+ (960, 512),
57
+ ],
58
+ 832: [
59
+ (544, 960),
60
+ (576, 864),
61
+ (608, 832),
62
+ (640, 768),
63
+ (672, 704),
64
+ (704, 672),
65
+ (768, 640),
66
+ (832, 608),
67
+ (864, 576),
68
+ (960, 544),
69
+ ],
70
+ 864: [
71
+ (576, 960),
72
+ (608, 864),
73
+ (640, 832),
74
+ (672, 768),
75
+ (704, 704),
76
+ (768, 672),
77
+ (832, 640),
78
+ (864, 608),
79
+ (960, 576),
80
+ ],
81
+ 960: [
82
+ (608, 960),
83
+ (640, 864),
84
+ (672, 832),
85
+ (704, 768),
86
+ (768, 704),
87
+ (832, 672),
88
+ (864, 640),
89
+ (960, 608),
90
+ ],
91
  }
92
 
93
 
 
99
  if metric <= min_metric:
100
  min_metric = metric
101
  best_bucket = (bucket_h, bucket_w)
102
+ print("The resolution of the generated video will be " + str(best_bucket))
103
  return best_bucket
 
diffusers_helper/models/hunyuan_video_packed.py CHANGED
@@ -122,21 +122,17 @@ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seq
122
  x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
123
  return x
124
 
125
- B, L, H, C = q.shape
126
-
127
- q = q.flatten(0, 1)
128
- k = k.flatten(0, 1)
129
- v = v.flatten(0, 1)
130
-
131
  if sageattn_varlen is not None:
132
  x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
133
  elif flash_attn_varlen_func is not None:
134
  x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
135
  else:
136
  raise NotImplementedError('No Attn Installed!')
137
-
138
- x = x.unflatten(0, (B, L))
139
-
140
  return x
141
 
142
 
@@ -362,7 +358,7 @@ class HunyuanVideoIndividualTokenRefiner(nn.Module):
362
  batch_size = attention_mask.shape[0]
363
  seq_len = attention_mask.shape[1]
364
  attention_mask = attention_mask.to(hidden_states.device).bool()
365
- self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
366
  self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
367
  self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
368
  self_attn_mask[:, :, :, 0] = True
@@ -930,22 +926,23 @@ class HunyuanVideoTransformer3DModelPacked(ModelMixin, ConfigMixin, PeftAdapterM
930
  encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
931
  encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
932
 
933
- if batch_size == 1:
934
- # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
935
- # If they are not same, then their impls are wrong. Ours are always the correct one.
936
- text_len = encoder_attention_mask.sum().item()
937
- encoder_hidden_states = encoder_hidden_states[:, :text_len]
938
- attention_mask = None, None, None, None
939
- else:
940
- img_seq_len = hidden_states.shape[1]
941
- txt_seq_len = encoder_hidden_states.shape[1]
 
942
 
943
- cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
944
- cu_seqlens_kv = cu_seqlens_q
945
- max_seqlen_q = img_seq_len + txt_seq_len
946
- max_seqlen_kv = max_seqlen_q
947
 
948
- attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
949
 
950
  if self.enable_teacache:
951
  modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
 
122
  x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
123
  return x
124
 
125
+ batch_size = q.shape[0]
126
+ q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
127
+ k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
128
+ v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
 
 
129
  if sageattn_varlen is not None:
130
  x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
131
  elif flash_attn_varlen_func is not None:
132
  x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
133
  else:
134
  raise NotImplementedError('No Attn Installed!')
135
+ x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
 
 
136
  return x
137
 
138
 
 
358
  batch_size = attention_mask.shape[0]
359
  seq_len = attention_mask.shape[1]
360
  attention_mask = attention_mask.to(hidden_states.device).bool()
361
+ self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).expand(-1, -1, seq_len, -1)
362
  self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
363
  self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
364
  self_attn_mask[:, :, :, 0] = True
 
926
  encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
927
  encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
928
 
929
+ with torch.no_grad():
930
+ if batch_size == 1:
931
+ # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
932
+ # If they are not same, then their impls are wrong. Ours are always the correct one.
933
+ text_len = encoder_attention_mask.sum().item()
934
+ encoder_hidden_states = encoder_hidden_states[:, :text_len]
935
+ attention_mask = None, None, None, None
936
+ else:
937
+ img_seq_len = hidden_states.shape[1]
938
+ txt_seq_len = encoder_hidden_states.shape[1]
939
 
940
+ cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
941
+ cu_seqlens_kv = cu_seqlens_q
942
+ max_seqlen_q = img_seq_len + txt_seq_len
943
+ max_seqlen_kv = max_seqlen_q
944
 
945
+ attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
946
 
947
  if self.enable_teacache:
948
  modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
img_examples/{1.png → Example1.mp4} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7d3490cb499fdbf55d64ad2f06e7c7e7a336245ba2cff50ddb2c9b47299cdae
3
- size 1329228
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a906a1d14d1699f67ca54865c7aa5857e55246f4ec63bbaf3edcf359e73bebd1
3
+ size 240647
img_examples/{2.jpg → Example1.png} RENAMED
File without changes
img_examples/{3.png → Example2.webp} RENAMED
File without changes
img_examples/Example3.jpg ADDED

Git LFS Details

  • SHA256: b1a9be93d2f117d687e08c91c043e67598bdb7c44f5c932f18a3026790fb82fa
  • Pointer size: 131 Bytes
  • Size of remote file: 208 kB
img_examples/Example4.webp ADDED

Git LFS Details

  • SHA256: dd4e7ef35f4cfc8d44ff97f38b68ba7cc248ad5b54c89f8525f5046508f7c4a3
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
img_examples/Example5.png ADDED

Git LFS Details

  • SHA256: b6a7b7521a2ffe77f60a78bb52013c1ef73bfcefbd809f45cfdeef804aee8906
  • Pointer size: 131 Bytes
  • Size of remote file: 431 kB
img_examples/Example6.png ADDED

Git LFS Details

  • SHA256: 59e76d165d9bece1775302a7e4032f31b28545937726d42f41b0c67aae9d4143
  • Pointer size: 131 Bytes
  • Size of remote file: 721 kB
requirements.txt CHANGED
@@ -1,18 +1,23 @@
1
- accelerate==1.6.0
2
- diffusers==0.33.1
3
- transformers==4.46.2
4
- sentencepiece==0.2.0
5
- pillow==11.1.0
6
- av==12.1.0
7
- numpy==1.26.2
8
- scipy==1.12.0
9
- requests==2.31.0
10
- torchsde==0.2.6
11
- torch>=2.0.0
12
- torchvision
13
- torchaudio
14
- einops
15
- opencv-contrib-python
16
- safetensors
17
- huggingface_hub
18
- spaces
 
 
 
 
 
 
1
+ accelerate==1.7.0
2
+ diffusers==0.33.1
3
+ transformers==4.52.4
4
+ sentencepiece==0.2.0
5
+ pillow==11.2.1
6
+ av==12.1.0
7
+ numpy==1.26.2
8
+ scipy==1.12.0
9
+ requests==2.32.4
10
+ torchsde==0.2.6
11
+ torch>=2.0.0
12
+ torchvision
13
+ torchaudio
14
+ einops
15
+ opencv-contrib-python
16
+ safetensors
17
+ huggingface_hub
18
+ decord
19
+ imageio_ffmpeg==0.6.0
20
+ sageattention==1.0.6
21
+ xformers==0.0.29.post3
22
+ bitsandbytes==0.46.0
23
+ pillow-heif==0.22.0