This Pull Request also adds support for extending an existing video, and optimizes generation time & VRAM usage

#1
.gitattributes CHANGED
@@ -36,3 +36,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  img_examples/1.png filter=lfs diff=lfs merge=lfs -text
37
  img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
38
  img_examples/3.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
36
  img_examples/1.png filter=lfs diff=lfs merge=lfs -text
37
  img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
38
  img_examples/3.png filter=lfs diff=lfs merge=lfs -text
39
+ img_examples/Example1.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ img_examples/Example1.png filter=lfs diff=lfs merge=lfs -text
41
+ img_examples/Example2.webp filter=lfs diff=lfs merge=lfs -text
42
+ img_examples/Example3.jpg filter=lfs diff=lfs merge=lfs -text
43
+ img_examples/Example4.webp filter=lfs diff=lfs merge=lfs -text
44
+ img_examples/Example5.png filter=lfs diff=lfs merge=lfs -text
45
+ img_examples/Example6.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -4,11 +4,19 @@ emoji: 📹⚡️
4
  colorFrom: pink
5
  colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.32.0
8
- app_file: app.py
9
  pinned: true
 
 
10
  license: apache-2.0
11
- short_description: fast video generation from images & text
 
 
 
 
 
 
 
 
12
  ---
13
  paper: arxiv:2504.12626
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: pink
5
  colorTo: gray
6
  sdk: gradio
 
 
7
  pinned: true
8
+ sdk_version: 5.29.1
9
+ app_file: app.py
10
  license: apache-2.0
11
+ short_description: Text-to-Video/Image-to-Video/Video extender (timed prompt)
12
+ tags:
13
+ - Image-to-Video
14
+ - Image-2-Video
15
+ - Img-to-Vid
16
+ - Img-2-Vid
17
+ - language models
18
+ - LLMs
19
+ suggested_hardware: zero-a10g
20
  ---
21
  paper: arxiv:2504.12626
22
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
app_lora.py ADDED
The diff for this file is too large to render. See raw diff
 
diffusers_helper/bucket_tools.py CHANGED
@@ -15,6 +15,79 @@ bucket_options = {
15
  (864, 448),
16
  (960, 416),
17
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  }
19
 
20
 
@@ -26,5 +99,5 @@ def find_nearest_bucket(h, w, resolution=640):
26
  if metric <= min_metric:
27
  min_metric = metric
28
  best_bucket = (bucket_h, bucket_w)
 
29
  return best_bucket
30
-
 
15
  (864, 448),
16
  (960, 416),
17
  ],
18
+ 672: [
19
+ (480, 864),
20
+ (512, 832),
21
+ (544, 768),
22
+ (576, 704),
23
+ (608, 672),
24
+ (640, 640),
25
+ (672, 608),
26
+ (704, 576),
27
+ (768, 544),
28
+ (832, 512),
29
+ (864, 480),
30
+ ],
31
+ 704: [
32
+ (480, 960),
33
+ (512, 864),
34
+ (544, 832),
35
+ (576, 768),
36
+ (608, 704),
37
+ (640, 672),
38
+ (672, 640),
39
+ (704, 608),
40
+ (768, 576),
41
+ (832, 544),
42
+ (864, 512),
43
+ (960, 480),
44
+ ],
45
+ 768: [
46
+ (512, 960),
47
+ (544, 864),
48
+ (576, 832),
49
+ (608, 768),
50
+ (640, 704),
51
+ (672, 672),
52
+ (704, 640),
53
+ (768, 608),
54
+ (832, 576),
55
+ (864, 544),
56
+ (960, 512),
57
+ ],
58
+ 832: [
59
+ (544, 960),
60
+ (576, 864),
61
+ (608, 832),
62
+ (640, 768),
63
+ (672, 704),
64
+ (704, 672),
65
+ (768, 640),
66
+ (832, 608),
67
+ (864, 576),
68
+ (960, 544),
69
+ ],
70
+ 864: [
71
+ (576, 960),
72
+ (608, 864),
73
+ (640, 832),
74
+ (672, 768),
75
+ (704, 704),
76
+ (768, 672),
77
+ (832, 640),
78
+ (864, 608),
79
+ (960, 576),
80
+ ],
81
+ 960: [
82
+ (608, 960),
83
+ (640, 864),
84
+ (672, 832),
85
+ (704, 768),
86
+ (768, 704),
87
+ (832, 672),
88
+ (864, 640),
89
+ (960, 608),
90
+ ],
91
  }
92
 
93
 
 
99
  if metric <= min_metric:
100
  min_metric = metric
101
  best_bucket = (bucket_h, bucket_w)
102
+ print("The resolution of the generated video will be " + str(best_bucket))
103
  return best_bucket
 
diffusers_helper/models/hunyuan_video_packed.py CHANGED
@@ -122,21 +122,17 @@ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seq
122
  x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
123
  return x
124
 
125
- B, L, H, C = q.shape
126
-
127
- q = q.flatten(0, 1)
128
- k = k.flatten(0, 1)
129
- v = v.flatten(0, 1)
130
-
131
  if sageattn_varlen is not None:
132
  x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
133
  elif flash_attn_varlen_func is not None:
134
  x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
135
  else:
136
  raise NotImplementedError('No Attn Installed!')
137
-
138
- x = x.unflatten(0, (B, L))
139
-
140
  return x
141
 
142
 
@@ -362,7 +358,7 @@ class HunyuanVideoIndividualTokenRefiner(nn.Module):
362
  batch_size = attention_mask.shape[0]
363
  seq_len = attention_mask.shape[1]
364
  attention_mask = attention_mask.to(hidden_states.device).bool()
365
- self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
366
  self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
367
  self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
368
  self_attn_mask[:, :, :, 0] = True
@@ -930,22 +926,23 @@ class HunyuanVideoTransformer3DModelPacked(ModelMixin, ConfigMixin, PeftAdapterM
930
  encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
931
  encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
932
 
933
- if batch_size == 1:
934
- # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
935
- # If they are not same, then their impls are wrong. Ours are always the correct one.
936
- text_len = encoder_attention_mask.sum().item()
937
- encoder_hidden_states = encoder_hidden_states[:, :text_len]
938
- attention_mask = None, None, None, None
939
- else:
940
- img_seq_len = hidden_states.shape[1]
941
- txt_seq_len = encoder_hidden_states.shape[1]
 
942
 
943
- cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
944
- cu_seqlens_kv = cu_seqlens_q
945
- max_seqlen_q = img_seq_len + txt_seq_len
946
- max_seqlen_kv = max_seqlen_q
947
 
948
- attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
949
 
950
  if self.enable_teacache:
951
  modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
 
122
  x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
123
  return x
124
 
125
+ batch_size = q.shape[0]
126
+ q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
127
+ k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
128
+ v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
 
 
129
  if sageattn_varlen is not None:
130
  x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
131
  elif flash_attn_varlen_func is not None:
132
  x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
133
  else:
134
  raise NotImplementedError('No Attn Installed!')
135
+ x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
 
 
136
  return x
137
 
138
 
 
358
  batch_size = attention_mask.shape[0]
359
  seq_len = attention_mask.shape[1]
360
  attention_mask = attention_mask.to(hidden_states.device).bool()
361
+ self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).expand(-1, -1, seq_len, -1)
362
  self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
363
  self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
364
  self_attn_mask[:, :, :, 0] = True
 
926
  encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
927
  encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
928
 
929
+ with torch.no_grad():
930
+ if batch_size == 1:
931
+ # When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
932
+ # If they are not same, then their impls are wrong. Ours are always the correct one.
933
+ text_len = encoder_attention_mask.sum().item()
934
+ encoder_hidden_states = encoder_hidden_states[:, :text_len]
935
+ attention_mask = None, None, None, None
936
+ else:
937
+ img_seq_len = hidden_states.shape[1]
938
+ txt_seq_len = encoder_hidden_states.shape[1]
939
 
940
+ cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
941
+ cu_seqlens_kv = cu_seqlens_q
942
+ max_seqlen_q = img_seq_len + txt_seq_len
943
+ max_seqlen_kv = max_seqlen_q
944
 
945
+ attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
946
 
947
  if self.enable_teacache:
948
  modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
img_examples/{1.png → Example1.mp4} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7d3490cb499fdbf55d64ad2f06e7c7e7a336245ba2cff50ddb2c9b47299cdae
3
- size 1329228
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a906a1d14d1699f67ca54865c7aa5857e55246f4ec63bbaf3edcf359e73bebd1
3
+ size 240647
img_examples/{2.jpg → Example1.png} RENAMED
File without changes
img_examples/{3.png → Example2.webp} RENAMED
File without changes
img_examples/Example3.jpg ADDED

Git LFS Details

  • SHA256: b1a9be93d2f117d687e08c91c043e67598bdb7c44f5c932f18a3026790fb82fa
  • Pointer size: 131 Bytes
  • Size of remote file: 208 kB
img_examples/Example4.webp ADDED

Git LFS Details

  • SHA256: dd4e7ef35f4cfc8d44ff97f38b68ba7cc248ad5b54c89f8525f5046508f7c4a3
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
img_examples/Example5.png ADDED

Git LFS Details

  • SHA256: b6a7b7521a2ffe77f60a78bb52013c1ef73bfcefbd809f45cfdeef804aee8906
  • Pointer size: 131 Bytes
  • Size of remote file: 431 kB
img_examples/Example6.png ADDED

Git LFS Details

  • SHA256: 59e76d165d9bece1775302a7e4032f31b28545937726d42f41b0c67aae9d4143
  • Pointer size: 131 Bytes
  • Size of remote file: 721 kB
requirements.txt CHANGED
@@ -1,18 +1,23 @@
1
- accelerate==1.6.0
2
- diffusers==0.33.1
3
- transformers==4.46.2
4
- sentencepiece==0.2.0
5
- pillow==11.1.0
6
- av==12.1.0
7
- numpy==1.26.2
8
- scipy==1.12.0
9
- requests==2.31.0
10
- torchsde==0.2.6
11
- torch>=2.0.0
12
- torchvision
13
- torchaudio
14
- einops
15
- opencv-contrib-python
16
- safetensors
17
- huggingface_hub
18
- spaces
 
 
 
 
 
 
1
+ accelerate==1.7.0
2
+ diffusers==0.33.1
3
+ transformers==4.52.4
4
+ sentencepiece==0.2.0
5
+ pillow==11.2.1
6
+ av==12.1.0
7
+ numpy==1.26.2
8
+ scipy==1.12.0
9
+ requests==2.32.4
10
+ torchsde==0.2.6
11
+ torch>=2.0.0
12
+ torchvision
13
+ torchaudio
14
+ einops
15
+ opencv-contrib-python
16
+ safetensors
17
+ huggingface_hub
18
+ decord
19
+ imageio_ffmpeg==0.6.0
20
+ sageattention==1.0.6
21
+ xformers==0.0.29.post3
22
+ bitsandbytes==0.46.0
23
+ pillow-heif==0.22.0