rajux75 committed
Commit 752fcf6 · verified · 1 Parent(s): e34dbc4

Upload 7 files

diffusers_helper/bucket_tools.py ADDED
@@ -0,0 +1,30 @@
+ bucket_options = {
+     640: [
+         (416, 960),
+         (448, 864),
+         (480, 832),
+         (512, 768),
+         (544, 704),
+         (576, 672),
+         (608, 640),
+         (640, 608),
+         (672, 576),
+         (704, 544),
+         (768, 512),
+         (832, 480),
+         (864, 448),
+         (960, 416),
+     ],
+ }
+
+
+ def find_nearest_bucket(h, w, resolution=640):
+     min_metric = float('inf')
+     best_bucket = None
+     for (bucket_h, bucket_w) in bucket_options[resolution]:
+         metric = abs(h * bucket_w - w * bucket_h)
+         if metric <= min_metric:
+             min_metric = metric
+             best_bucket = (bucket_h, bucket_w)
+     return best_bucket
+
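For reference (not part of the commit): a quick sketch of how find_nearest_bucket snaps an arbitrary frame size to one of the 640-resolution buckets. Note that only the 640 key exists, so other resolution values would raise a KeyError.

# Illustrative only: pick the bucket whose aspect ratio best matches a 1080p source.
from diffusers_helper.bucket_tools import find_nearest_bucket

h, w = 1080, 1920                                  # landscape source frame
bucket_h, bucket_w = find_nearest_bucket(h, w, resolution=640)
print(bucket_h, bucket_w)                          # (480, 832): smallest |h*bw - w*bh| among the buckets
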
diffusers_helper/clip_vision.py ADDED
@@ -0,0 +1,12 @@
+ import numpy as np
+
+
+ def hf_clip_vision_encode(image, feature_extractor, image_encoder):
+     assert isinstance(image, np.ndarray)
+     assert image.ndim == 3 and image.shape[2] == 3
+     assert image.dtype == np.uint8
+
+     preprocessed = feature_extractor.preprocess(images=image, return_tensors="pt").to(device=image_encoder.device, dtype=image_encoder.dtype)
+     image_encoder_output = image_encoder(**preprocessed)
+
+     return image_encoder_output
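For reference (not part of the commit): a minimal usage sketch. The tiny randomly initialised SigLIP tower below is purely an assumption so the snippet runs without downloads; the app presumably loads pretrained weights for the image encoder and its processor.

# Sketch: encode an HWC uint8 RGB frame with a SigLIP vision tower from transformers.
import numpy as np
from transformers import SiglipImageProcessor, SiglipVisionConfig, SiglipVisionModel
from diffusers_helper.clip_vision import hf_clip_vision_encode

feature_extractor = SiglipImageProcessor(size={"height": 384, "width": 384})
image_encoder = SiglipVisionModel(SiglipVisionConfig(image_size=384, patch_size=32,
                                                     hidden_size=64, intermediate_size=128,
                                                     num_hidden_layers=2, num_attention_heads=2))

image = np.zeros((480, 832, 3), dtype=np.uint8)    # HWC uint8 RGB, exactly what the asserts require
out = hf_clip_vision_encode(image, feature_extractor, image_encoder)
print(out.last_hidden_state.shape)                 # (1, num_patches, hidden_size)
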
diffusers_helper/dit_common.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ import accelerate.accelerator
+
+ from diffusers.models.normalization import RMSNorm, LayerNorm, FP32LayerNorm, AdaLayerNormContinuous
+
+
+ accelerate.accelerator.convert_outputs_to_fp32 = lambda x: x
+
+
+ def LayerNorm_forward(self, x):
+     return torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps).to(x)
+
+
+ LayerNorm.forward = LayerNorm_forward
+ torch.nn.LayerNorm.forward = LayerNorm_forward
+
+
+ def FP32LayerNorm_forward(self, x):
+     origin_dtype = x.dtype
+     return torch.nn.functional.layer_norm(
+         x.float(),
+         self.normalized_shape,
+         self.weight.float() if self.weight is not None else None,
+         self.bias.float() if self.bias is not None else None,
+         self.eps,
+     ).to(origin_dtype)
+
+
+ FP32LayerNorm.forward = FP32LayerNorm_forward
+
+
+ def RMSNorm_forward(self, hidden_states):
+     input_dtype = hidden_states.dtype
+     variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+     hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+     if self.weight is None:
+         return hidden_states.to(input_dtype)
+
+     return hidden_states.to(input_dtype) * self.weight.to(input_dtype)
+
+
+ RMSNorm.forward = RMSNorm_forward
+
+
+ def AdaLayerNormContinuous_forward(self, x, conditioning_embedding):
+     emb = self.linear(self.silu(conditioning_embedding))
+     scale, shift = emb.chunk(2, dim=1)
+     x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+     return x
+
+
+ AdaLayerNormContinuous.forward = AdaLayerNormContinuous_forward
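For reference (not part of the commit), a minimal sketch assuming the module is applied simply by importing it once at startup: the assignments above monkey-patch the diffusers and torch norm classes in place, so every model instantiated afterwards uses the replacement forwards.

# Sketch: importing the module is the whole "API"; verify the patches took effect.
import torch
from diffusers.models.normalization import RMSNorm

import diffusers_helper.dit_common as dit_common   # side effect: patches the norm layers

assert torch.nn.LayerNorm.forward is dit_common.LayerNorm_forward
assert RMSNorm.forward is dit_common.RMSNorm_forward
print("norm forwards patched")
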
diffusers_helper/hf_login.py ADDED
@@ -0,0 +1,21 @@
+ import os
+
+
+ def login(token):
+     from huggingface_hub import login
+     import time
+
+     while True:
+         try:
+             login(token)
+             print('HF login ok.')
+             break
+         except Exception as e:
+             print(f'HF login failed: {e}. Retrying')
+             time.sleep(0.5)
+
+
+ hf_token = os.environ.get('HF_TOKEN', None)
+
+ if hf_token is not None:
+     login(hf_token)
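For reference (not part of the commit), a minimal sketch assuming the module is meant to be imported once at startup: with HF_TOKEN unset the import is a no-op; when it is set, login() is retried every 0.5 s until huggingface_hub accepts the token.

# Sketch: the login happens as an import side effect.
import os
os.environ.pop('HF_TOKEN', None)        # no token in this sketch, so nothing happens
import diffusers_helper.hf_login        # noqa: F401  (would log in, with retries, if HF_TOKEN were set)
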
diffusers_helper/hunyuan.py ADDED
@@ -0,0 +1,111 @@
+ import torch
+
+ from diffusers.pipelines.hunyuan_video.pipeline_hunyuan_video import DEFAULT_PROMPT_TEMPLATE
+ from diffusers_helper.utils import crop_or_pad_yield_mask
+
+
+ @torch.no_grad()
+ def encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2, max_length=256):
+     assert isinstance(prompt, str)
+
+     prompt = [prompt]
+
+     # LLAMA
+
+     prompt_llama = [DEFAULT_PROMPT_TEMPLATE["template"].format(p) for p in prompt]
+     crop_start = DEFAULT_PROMPT_TEMPLATE["crop_start"]
+
+     llama_inputs = tokenizer(
+         prompt_llama,
+         padding="max_length",
+         max_length=max_length + crop_start,
+         truncation=True,
+         return_tensors="pt",
+         return_length=False,
+         return_overflowing_tokens=False,
+         return_attention_mask=True,
+     )
+
+     llama_input_ids = llama_inputs.input_ids.to(text_encoder.device)
+     llama_attention_mask = llama_inputs.attention_mask.to(text_encoder.device)
+     llama_attention_length = int(llama_attention_mask.sum())
+
+     llama_outputs = text_encoder(
+         input_ids=llama_input_ids,
+         attention_mask=llama_attention_mask,
+         output_hidden_states=True,
+     )
+
+     llama_vec = llama_outputs.hidden_states[-3][:, crop_start:llama_attention_length]
+     # llama_vec_remaining = llama_outputs.hidden_states[-3][:, llama_attention_length:]
+     llama_attention_mask = llama_attention_mask[:, crop_start:llama_attention_length]
+
+     assert torch.all(llama_attention_mask.bool())
+
+     # CLIP
+
+     clip_l_input_ids = tokenizer_2(
+         prompt,
+         padding="max_length",
+         max_length=77,
+         truncation=True,
+         return_overflowing_tokens=False,
+         return_length=False,
+         return_tensors="pt",
+     ).input_ids
+     clip_l_pooler = text_encoder_2(clip_l_input_ids.to(text_encoder_2.device), output_hidden_states=False).pooler_output
+
+     return llama_vec, clip_l_pooler
+
+
+ @torch.no_grad()
+ def vae_decode_fake(latents):
+     latent_rgb_factors = [
+         [-0.0395, -0.0331, 0.0445],
+         [0.0696, 0.0795, 0.0518],
+         [0.0135, -0.0945, -0.0282],
+         [0.0108, -0.0250, -0.0765],
+         [-0.0209, 0.0032, 0.0224],
+         [-0.0804, -0.0254, -0.0639],
+         [-0.0991, 0.0271, -0.0669],
+         [-0.0646, -0.0422, -0.0400],
+         [-0.0696, -0.0595, -0.0894],
+         [-0.0799, -0.0208, -0.0375],
+         [0.1166, 0.1627, 0.0962],
+         [0.1165, 0.0432, 0.0407],
+         [-0.2315, -0.1920, -0.1355],
+         [-0.0270, 0.0401, -0.0821],
+         [-0.0616, -0.0997, -0.0727],
+         [0.0249, -0.0469, -0.1703]
+     ]  # From comfyui
+
+     latent_rgb_factors_bias = [0.0259, -0.0192, -0.0761]
+
+     weight = torch.tensor(latent_rgb_factors, device=latents.device, dtype=latents.dtype).transpose(0, 1)[:, :, None, None, None]
+     bias = torch.tensor(latent_rgb_factors_bias, device=latents.device, dtype=latents.dtype)
+
+     images = torch.nn.functional.conv3d(latents, weight, bias=bias, stride=1, padding=0, dilation=1, groups=1)
+     images = images.clamp(0.0, 1.0)
+
+     return images
+
+
+ @torch.no_grad()
+ def vae_decode(latents, vae, image_mode=False):
+     latents = latents / vae.config.scaling_factor
+
+     if not image_mode:
+         image = vae.decode(latents.to(device=vae.device, dtype=vae.dtype)).sample
+     else:
+         latents = latents.to(device=vae.device, dtype=vae.dtype).unbind(2)
+         image = [vae.decode(l.unsqueeze(2)).sample for l in latents]
+         image = torch.cat(image, dim=2)
+
+     return image
+
+
+ @torch.no_grad()
+ def vae_encode(image, vae):
+     latents = vae.encode(image.to(device=vae.device, dtype=vae.dtype)).latent_dist.sample()
+     latents = latents * vae.config.scaling_factor
+     return latents
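For reference (not part of the commit): vae_decode_fake maps the 16 latent channels straight to RGB with a fixed 1x1x1 conv3d, so it gives a cheap preview without running the real VAE. The latent shape below is an illustrative assumption.

# Sketch: project video latents (B, 16, T, H, W) to a clamped RGB preview (B, 3, T, H, W).
import torch
from diffusers_helper.hunyuan import vae_decode_fake

latents = torch.randn(1, 16, 9, 64, 96)    # (batch, 16 latent channels, frames, h, w)
preview = vae_decode_fake(latents)
print(preview.shape)                        # torch.Size([1, 3, 9, 64, 96]), values in [0, 1]
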
diffusers_helper/memory.py ADDED
@@ -0,0 +1,134 @@
+ # By lllyasviel
+
+
+ import torch
+
+
+ cpu = torch.device('cpu')
+ gpu = torch.device(f'cuda:{torch.cuda.current_device()}')
+ gpu_complete_modules = []
+
+
+ class DynamicSwapInstaller:
+     @staticmethod
+     def _install_module(module: torch.nn.Module, **kwargs):
+         original_class = module.__class__
+         module.__dict__['forge_backup_original_class'] = original_class
+
+         def hacked_get_attr(self, name: str):
+             if '_parameters' in self.__dict__:
+                 _parameters = self.__dict__['_parameters']
+                 if name in _parameters:
+                     p = _parameters[name]
+                     if p is None:
+                         return None
+                     if p.__class__ == torch.nn.Parameter:
+                         return torch.nn.Parameter(p.to(**kwargs), requires_grad=p.requires_grad)
+                     else:
+                         return p.to(**kwargs)
+             if '_buffers' in self.__dict__:
+                 _buffers = self.__dict__['_buffers']
+                 if name in _buffers:
+                     return _buffers[name].to(**kwargs)
+             return super(original_class, self).__getattr__(name)
+
+         module.__class__ = type('DynamicSwap_' + original_class.__name__, (original_class,), {
+             '__getattr__': hacked_get_attr,
+         })
+
+         return
+
+     @staticmethod
+     def _uninstall_module(module: torch.nn.Module):
+         if 'forge_backup_original_class' in module.__dict__:
+             module.__class__ = module.__dict__.pop('forge_backup_original_class')
+         return
+
+     @staticmethod
+     def install_model(model: torch.nn.Module, **kwargs):
+         for m in model.modules():
+             DynamicSwapInstaller._install_module(m, **kwargs)
+         return
+
+     @staticmethod
+     def uninstall_model(model: torch.nn.Module):
+         for m in model.modules():
+             DynamicSwapInstaller._uninstall_module(m)
+         return
+
+
+ def fake_diffusers_current_device(model: torch.nn.Module, target_device: torch.device):
+     if hasattr(model, 'scale_shift_table'):
+         model.scale_shift_table.data = model.scale_shift_table.data.to(target_device)
+         return
+
+     for k, p in model.named_modules():
+         if hasattr(p, 'weight'):
+             p.to(target_device)
+             return
+
+
+ def get_cuda_free_memory_gb(device=None):
+     if device is None:
+         device = gpu
+
+     memory_stats = torch.cuda.memory_stats(device)
+     bytes_active = memory_stats['active_bytes.all.current']
+     bytes_reserved = memory_stats['reserved_bytes.all.current']
+     bytes_free_cuda, _ = torch.cuda.mem_get_info(device)
+     bytes_inactive_reserved = bytes_reserved - bytes_active
+     bytes_total_available = bytes_free_cuda + bytes_inactive_reserved
+     return bytes_total_available / (1024 ** 3)
+
+
+ def move_model_to_device_with_memory_preservation(model, target_device, preserved_memory_gb=0):
+     print(f'Moving {model.__class__.__name__} to {target_device} with preserved memory: {preserved_memory_gb} GB')
+
+     for m in model.modules():
+         if get_cuda_free_memory_gb(target_device) <= preserved_memory_gb:
+             torch.cuda.empty_cache()
+             return
+
+         if hasattr(m, 'weight'):
+             m.to(device=target_device)
+
+     model.to(device=target_device)
+     torch.cuda.empty_cache()
+     return
+
+
+ def offload_model_from_device_for_memory_preservation(model, target_device, preserved_memory_gb=0):
+     print(f'Offloading {model.__class__.__name__} from {target_device} to preserve memory: {preserved_memory_gb} GB')
+
+     for m in model.modules():
+         if get_cuda_free_memory_gb(target_device) >= preserved_memory_gb:
+             torch.cuda.empty_cache()
+             return
+
+         if hasattr(m, 'weight'):
+             m.to(device=cpu)
+
+     model.to(device=cpu)
+     torch.cuda.empty_cache()
+     return
+
+
+ def unload_complete_models(*args):
+     for m in gpu_complete_modules + list(args):
+         m.to(device=cpu)
+         print(f'Unloaded {m.__class__.__name__} as complete.')
+
+     gpu_complete_modules.clear()
+     torch.cuda.empty_cache()
+     return
+
+
+ def load_model_as_complete(model, target_device, unload=True):
+     if unload:
+         unload_complete_models()
+
+     model.to(device=target_device)
+     print(f'Loaded {model.__class__.__name__} to {target_device} as complete.')
+
+     gpu_complete_modules.append(model)
+     return
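For reference (not part of the commit), a hypothetical usage sketch of the dynamic-swap idea: parameters stay stored on the CPU, and each attribute access returns a fresh GPU copy. The tiny Linear stands in for the real transformer, and a CUDA device is assumed since `gpu` is resolved when the module is imported.

# Sketch: install the swap, then observe where storage vs. accessed weights live.
import torch
from diffusers_helper.memory import DynamicSwapInstaller, gpu, get_cuda_free_memory_gb

linear = torch.nn.Linear(8, 8)                       # parameters allocated on CPU
DynamicSwapInstaller.install_model(linear, device=gpu)

print(linear._parameters['weight'].device)           # cpu   (storage never moves)
print(linear.weight.device)                          # cuda  (attribute access yields a GPU copy)

y = linear(torch.randn(2, 8, device=gpu))            # forward runs on the per-access GPU copies
print(y.device, f'{get_cuda_free_memory_gb(gpu):.1f} GB free')
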
diffusers_helper/thread_utils.py ADDED
@@ -0,0 +1,76 @@
+ import time
+
+ from threading import Thread, Lock
+
+
+ class Listener:
+     task_queue = []
+     lock = Lock()
+     thread = None
+
+     @classmethod
+     def _process_tasks(cls):
+         while True:
+             task = None
+             with cls.lock:
+                 if cls.task_queue:
+                     task = cls.task_queue.pop(0)
+
+             if task is None:
+                 time.sleep(0.001)
+                 continue
+
+             func, args, kwargs = task
+             try:
+                 func(*args, **kwargs)
+             except Exception as e:
+                 print(f"Error in listener thread: {e}")
+
+     @classmethod
+     def add_task(cls, func, *args, **kwargs):
+         with cls.lock:
+             cls.task_queue.append((func, args, kwargs))
+
+         if cls.thread is None:
+             cls.thread = Thread(target=cls._process_tasks, daemon=True)
+             cls.thread.start()
+
+
+ def async_run(func, *args, **kwargs):
+     Listener.add_task(func, *args, **kwargs)
+
+
+ class FIFOQueue:
+     def __init__(self):
+         self.queue = []
+         self.lock = Lock()
+
+     def push(self, item):
+         with self.lock:
+             self.queue.append(item)
+
+     def pop(self):
+         with self.lock:
+             if self.queue:
+                 return self.queue.pop(0)
+             return None
+
+     def top(self):
+         with self.lock:
+             if self.queue:
+                 return self.queue[0]
+             return None
+
+     def next(self):
+         while True:
+             with self.lock:
+                 if self.queue:
+                     return self.queue.pop(0)
+
+             time.sleep(0.001)
+
+
+ class AsyncStream:
+     def __init__(self):
+         self.input_queue = FIFOQueue()
+         self.output_queue = FIFOQueue()
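For reference (not part of the commit), a minimal sketch of the producer/consumer pattern these helpers support: run a worker off the main thread with async_run and exchange messages through the AsyncStream queues, blocking on next() on the consumer side. The message tuples below are illustrative.

# Sketch: worker pushes progress messages; the caller drains them until an 'end' flag.
from diffusers_helper.thread_utils import AsyncStream, async_run

stream = AsyncStream()

def worker():
    for i in range(3):
        stream.output_queue.push(('progress', i))
    stream.output_queue.push(('end', None))

async_run(worker)                                 # queued onto the daemon Listener thread

while True:
    flag, data = stream.output_queue.next()       # blocks until the worker pushes something
    print(flag, data)
    if flag == 'end':
        break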