Spaces:
Running
on
Zero
Running
on
Zero
This Pull Request also adds video extension and optimizes generation time & VRAM usage
#1
by
Fabrice-TIERCELIN
- opened
- .gitattributes +7 -0
- README.md +11 -3
- app.py +0 -0
- app_lora.py +0 -0
- diffusers_helper/bucket_tools.py +74 -1
- diffusers_helper/models/hunyuan_video_packed.py +21 -24
- img_examples/{1.png → Example1.mp4} +2 -2
- img_examples/{2.jpg → Example1.png} +2 -2
- img_examples/{3.png → Example2.webp} +2 -2
- img_examples/Example3.jpg +3 -0
- img_examples/Example4.webp +3 -0
- img_examples/Example5.png +3 -0
- img_examples/Example6.png +3 -0
- requirements.txt +23 -18
.gitattributes
CHANGED
@@ -36,3 +36,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
36 |
img_examples/1.png filter=lfs diff=lfs merge=lfs -text
|
37 |
img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
|
38 |
img_examples/3.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
img_examples/1.png filter=lfs diff=lfs merge=lfs -text
|
37 |
img_examples/2.jpg filter=lfs diff=lfs merge=lfs -text
|
38 |
img_examples/3.png filter=lfs diff=lfs merge=lfs -text
|
39 |
+
img_examples/Example1.mp4 filter=lfs diff=lfs merge=lfs -text
|
40 |
+
img_examples/Example1.png filter=lfs diff=lfs merge=lfs -text
|
41 |
+
img_examples/Example2.webp filter=lfs diff=lfs merge=lfs -text
|
42 |
+
img_examples/Example3.jpg filter=lfs diff=lfs merge=lfs -text
|
43 |
+
img_examples/Example4.webp filter=lfs diff=lfs merge=lfs -text
|
44 |
+
img_examples/Example5.png filter=lfs diff=lfs merge=lfs -text
|
45 |
+
img_examples/Example6.png filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -4,11 +4,19 @@ emoji: 📹⚡️
|
|
4 |
colorFrom: pink
|
5 |
colorTo: gray
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.32.0
|
8 |
-
app_file: app.py
|
9 |
pinned: true
|
|
|
|
|
10 |
license: apache-2.0
|
11 |
-
short_description:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
---
|
13 |
paper: arxiv:2504.12626
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
4 |
colorFrom: pink
|
5 |
colorTo: gray
|
6 |
sdk: gradio
|
|
|
|
|
7 |
pinned: true
|
8 |
+
sdk_version: 5.29.1
|
9 |
+
app_file: app.py
|
10 |
license: apache-2.0
|
11 |
+
short_description: Text-to-Video/Image-to-Video/Video extender (timed prompt)
|
12 |
+
tags:
|
13 |
+
- Image-to-Video
|
14 |
+
- Image-2-Video
|
15 |
+
- Img-to-Vid
|
16 |
+
- Img-2-Vid
|
17 |
+
- language models
|
18 |
+
- LLMs
|
19 |
+
suggested_hardware: zero-a10g
|
20 |
---
|
21 |
paper: arxiv:2504.12626
|
22 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app_lora.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
diffusers_helper/bucket_tools.py
CHANGED
@@ -15,6 +15,79 @@ bucket_options = {
|
|
15 |
(864, 448),
|
16 |
(960, 416),
|
17 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
}
|
19 |
|
20 |
|
@@ -26,5 +99,5 @@ def find_nearest_bucket(h, w, resolution=640):
|
|
26 |
if metric <= min_metric:
|
27 |
min_metric = metric
|
28 |
best_bucket = (bucket_h, bucket_w)
|
|
|
29 |
return best_bucket
|
30 |
-
|
|
|
15 |
(864, 448),
|
16 |
(960, 416),
|
17 |
],
|
18 |
+
672: [
|
19 |
+
(480, 864),
|
20 |
+
(512, 832),
|
21 |
+
(544, 768),
|
22 |
+
(576, 704),
|
23 |
+
(608, 672),
|
24 |
+
(640, 640),
|
25 |
+
(672, 608),
|
26 |
+
(704, 576),
|
27 |
+
(768, 544),
|
28 |
+
(832, 512),
|
29 |
+
(864, 480),
|
30 |
+
],
|
31 |
+
704: [
|
32 |
+
(480, 960),
|
33 |
+
(512, 864),
|
34 |
+
(544, 832),
|
35 |
+
(576, 768),
|
36 |
+
(608, 704),
|
37 |
+
(640, 672),
|
38 |
+
(672, 640),
|
39 |
+
(704, 608),
|
40 |
+
(768, 576),
|
41 |
+
(832, 544),
|
42 |
+
(864, 512),
|
43 |
+
(960, 480),
|
44 |
+
],
|
45 |
+
768: [
|
46 |
+
(512, 960),
|
47 |
+
(544, 864),
|
48 |
+
(576, 832),
|
49 |
+
(608, 768),
|
50 |
+
(640, 704),
|
51 |
+
(672, 672),
|
52 |
+
(704, 640),
|
53 |
+
(768, 608),
|
54 |
+
(832, 576),
|
55 |
+
(864, 544),
|
56 |
+
(960, 512),
|
57 |
+
],
|
58 |
+
832: [
|
59 |
+
(544, 960),
|
60 |
+
(576, 864),
|
61 |
+
(608, 832),
|
62 |
+
(640, 768),
|
63 |
+
(672, 704),
|
64 |
+
(704, 672),
|
65 |
+
(768, 640),
|
66 |
+
(832, 608),
|
67 |
+
(864, 576),
|
68 |
+
(960, 544),
|
69 |
+
],
|
70 |
+
864: [
|
71 |
+
(576, 960),
|
72 |
+
(608, 864),
|
73 |
+
(640, 832),
|
74 |
+
(672, 768),
|
75 |
+
(704, 704),
|
76 |
+
(768, 672),
|
77 |
+
(832, 640),
|
78 |
+
(864, 608),
|
79 |
+
(960, 576),
|
80 |
+
],
|
81 |
+
960: [
|
82 |
+
(608, 960),
|
83 |
+
(640, 864),
|
84 |
+
(672, 832),
|
85 |
+
(704, 768),
|
86 |
+
(768, 704),
|
87 |
+
(832, 672),
|
88 |
+
(864, 640),
|
89 |
+
(960, 608),
|
90 |
+
],
|
91 |
}
|
92 |
|
93 |
|
|
|
99 |
if metric <= min_metric:
|
100 |
min_metric = metric
|
101 |
best_bucket = (bucket_h, bucket_w)
|
102 |
+
print("The resolution of the generated video will be " + str(best_bucket))
|
103 |
return best_bucket
|
|
diffusers_helper/models/hunyuan_video_packed.py
CHANGED
@@ -122,21 +122,17 @@ def attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seq
|
|
122 |
x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
|
123 |
return x
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
v = v.flatten(0, 1)
|
130 |
-
|
131 |
if sageattn_varlen is not None:
|
132 |
x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
|
133 |
elif flash_attn_varlen_func is not None:
|
134 |
x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
|
135 |
else:
|
136 |
raise NotImplementedError('No Attn Installed!')
|
137 |
-
|
138 |
-
x = x.unflatten(0, (B, L))
|
139 |
-
|
140 |
return x
|
141 |
|
142 |
|
@@ -362,7 +358,7 @@ class HunyuanVideoIndividualTokenRefiner(nn.Module):
|
|
362 |
batch_size = attention_mask.shape[0]
|
363 |
seq_len = attention_mask.shape[1]
|
364 |
attention_mask = attention_mask.to(hidden_states.device).bool()
|
365 |
-
self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).expand(-1, -1, seq_len, -1)
|
366 |
self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
|
367 |
self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
|
368 |
self_attn_mask[:, :, :, 0] = True
|
@@ -930,22 +926,23 @@ class HunyuanVideoTransformer3DModelPacked(ModelMixin, ConfigMixin, PeftAdapterM
|
|
930 |
encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
|
931 |
encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
|
932 |
|
933 |
-
|
934 |
-
|
935 |
-
|
936 |
-
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
-
|
|
|
942 |
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
|
948 |
-
|
949 |
|
950 |
if self.enable_teacache:
|
951 |
modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
|
|
|
122 |
x = torch.nn.functional.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)).transpose(1, 2)
|
123 |
return x
|
124 |
|
125 |
+
batch_size = q.shape[0]
|
126 |
+
q = q.view(q.shape[0] * q.shape[1], *q.shape[2:])
|
127 |
+
k = k.view(k.shape[0] * k.shape[1], *k.shape[2:])
|
128 |
+
v = v.view(v.shape[0] * v.shape[1], *v.shape[2:])
|
|
|
|
|
129 |
if sageattn_varlen is not None:
|
130 |
x = sageattn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
|
131 |
elif flash_attn_varlen_func is not None:
|
132 |
x = flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv)
|
133 |
else:
|
134 |
raise NotImplementedError('No Attn Installed!')
|
135 |
+
x = x.view(batch_size, max_seqlen_q, *x.shape[2:])
|
|
|
|
|
136 |
return x
|
137 |
|
138 |
|
|
|
358 |
batch_size = attention_mask.shape[0]
|
359 |
seq_len = attention_mask.shape[1]
|
360 |
attention_mask = attention_mask.to(hidden_states.device).bool()
|
361 |
+
self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).expand(-1, -1, seq_len, -1)
|
362 |
self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
|
363 |
self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
|
364 |
self_attn_mask[:, :, :, 0] = True
|
|
|
926 |
encoder_hidden_states = torch.cat([extra_encoder_hidden_states, encoder_hidden_states], dim=1)
|
927 |
encoder_attention_mask = torch.cat([extra_attention_mask, encoder_attention_mask], dim=1)
|
928 |
|
929 |
+
with torch.no_grad():
|
930 |
+
if batch_size == 1:
|
931 |
+
# When batch size is 1, we do not need any masks or var-len funcs since cropping is mathematically same to what we want
|
932 |
+
# If they are not same, then their impls are wrong. Ours are always the correct one.
|
933 |
+
text_len = encoder_attention_mask.sum().item()
|
934 |
+
encoder_hidden_states = encoder_hidden_states[:, :text_len]
|
935 |
+
attention_mask = None, None, None, None
|
936 |
+
else:
|
937 |
+
img_seq_len = hidden_states.shape[1]
|
938 |
+
txt_seq_len = encoder_hidden_states.shape[1]
|
939 |
|
940 |
+
cu_seqlens_q = get_cu_seqlens(encoder_attention_mask, img_seq_len)
|
941 |
+
cu_seqlens_kv = cu_seqlens_q
|
942 |
+
max_seqlen_q = img_seq_len + txt_seq_len
|
943 |
+
max_seqlen_kv = max_seqlen_q
|
944 |
|
945 |
+
attention_mask = cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv
|
946 |
|
947 |
if self.enable_teacache:
|
948 |
modulated_inp = self.transformer_blocks[0].norm1(hidden_states, emb=temb)[0]
|
img_examples/{1.png → Example1.mp4}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a906a1d14d1699f67ca54865c7aa5857e55246f4ec63bbaf3edcf359e73bebd1
|
3 |
+
size 240647
|
img_examples/{2.jpg → Example1.png}
RENAMED
File without changes
|
img_examples/{3.png → Example2.webp}
RENAMED
File without changes
|
img_examples/Example3.jpg
ADDED
![]() |
Git LFS Details
|
img_examples/Example4.webp
ADDED
![]() |
Git LFS Details
|
img_examples/Example5.png
ADDED
![]() |
Git LFS Details
|
img_examples/Example6.png
ADDED
![]() |
Git LFS Details
|
requirements.txt
CHANGED
@@ -1,18 +1,23 @@
|
|
1 |
-
accelerate==1.
|
2 |
-
diffusers==0.33.1
|
3 |
-
transformers==4.
|
4 |
-
sentencepiece==0.2.0
|
5 |
-
pillow==11.1
|
6 |
-
av==12.1.0
|
7 |
-
numpy==1.26.2
|
8 |
-
scipy==1.12.0
|
9 |
-
requests==2.
|
10 |
-
torchsde==0.2.6
|
11 |
-
torch>=2.0.0
|
12 |
-
torchvision
|
13 |
-
torchaudio
|
14 |
-
einops
|
15 |
-
opencv-contrib-python
|
16 |
-
safetensors
|
17 |
-
huggingface_hub
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate==1.7.0
|
2 |
+
diffusers==0.33.1
|
3 |
+
transformers==4.52.4
|
4 |
+
sentencepiece==0.2.0
|
5 |
+
pillow==11.2.1
|
6 |
+
av==12.1.0
|
7 |
+
numpy==1.26.2
|
8 |
+
scipy==1.12.0
|
9 |
+
requests==2.32.4
|
10 |
+
torchsde==0.2.6
|
11 |
+
torch>=2.0.0
|
12 |
+
torchvision
|
13 |
+
torchaudio
|
14 |
+
einops
|
15 |
+
opencv-contrib-python
|
16 |
+
safetensors
|
17 |
+
huggingface_hub
|
18 |
+
decord
|
19 |
+
imageio_ffmpeg==0.6.0
|
20 |
+
sageattention==1.0.6
|
21 |
+
xformers==0.0.29.post3
|
22 |
+
bitsandbytes==0.46.0
|
23 |
+
pillow-heif==0.22.0
|