File size: 2,231 Bytes
3a6a639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import torch

from diffusers import CosmosTextToWorldPipeline, CosmosTransformer3DModel
from diffusers.utils import export_to_video
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import AutoModel, T5EncoderModel

model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"


# 4-bit bitsandbytes settings, used below when loading the text encoder.
bnb_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # store weights in the NF4 data type
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the actual computation in bfloat16
)


# Load the T5 text encoder from the model repo's "text_encoder" subfolder,
# quantized to 4 bits with `bnb_config`.
# NOTE(review): `bnb_config` is the diffusers BitsAndBytesConfig, while
# T5EncoderModel comes from transformers -- confirm this combination is
# supported, or switch to transformers.BitsAndBytesConfig.
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    quantization_config=bnb_config,
    device_map="auto",  # let the loader decide device placement
)

# 4-bit settings for the diffusion transformer. Unlike `bnb_config`, no
# `bnb_4bit_quant_type` is passed here, so the library default applies.
quant_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the Cosmos transformer from the "transformer" subfolder in 4-bit form.
transformer = CosmosTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    quantization_config=quant_config,
)



# Assemble the text-to-world pipeline around the pre-quantized components.
# Components are overridden by passing them under their pipeline component
# name; the keyword must be `transformer` (it was misspelled `transformers`,
# which does not match any component, so the 4-bit transformer loaded above
# was not the one used by the pipeline).
pipe = CosmosTextToWorldPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder,
    transformer=transformer,
    torch_dtype=torch.bfloat16,  # dtype for the components loaded here
).to("cuda")

# The pipeline is already moved to CUDA where it is created. The author left
# `pipe.enable_model_cpu_offload()` and `pipe.enable_vae_slicing()` disabled.

# Adjacent string literals are concatenated into one single-line prompt.
prompt = (
    "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. "
    "The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. "
    "A glowing blue light emanates from its chest, adding a touch of advanced technology. "
    "The background is dominated by rows of boxes, suggesting a highly organized storage system. "
    "The floor is lined with wooden pallets, enhancing the industrial setting. "
    "The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
)

# Run the pipeline at 960x704 and take the first entry of `.frames`.
result = pipe(prompt=prompt, width=960, height=704)
output = result.frames[0]

# Write the frames to an MP4 file at 30 frames per second.
export_to_video(output, "output.mp4", fps=30)