# rippertnt's picture
# Upload 6 files
# 3a6a639 verified
import torch
from diffusers import CosmosTextToWorldPipeline, CosmosTransformer3DModel
from diffusers.utils import export_to_video
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import AutoModel, T5EncoderModel
# Text-to-world video generation with NVIDIA Cosmos, loading the text encoder
# and the diffusion transformer in 4-bit (bitsandbytes) before assembling the
# pipeline, then rendering one prompt to "output.mp4".
model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"

# 4-bit settings for the text encoder.
# NOTE(review): this is the diffusers BitsAndBytesConfig handed to a
# transformers model; the transformers BitsAndBytesConfig may be the intended
# class here -- confirm against the quantization docs.
bnb_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16
    bnb_4bit_quant_type="nf4",  # NF4 is more precise
    bnb_4bit_use_double_quant=True,  # double quant recommended for a smaller model
)

# Load the text encoder in 4-bit.
text_encoder = T5EncoderModel.from_pretrained(
    model_id,
    subfolder="text_encoder",
    device_map="auto",  # automatic GPU placement
    quantization_config=bnb_config,
)

# 4-bit settings for the diffusion transformer.
# NOTE(review): bnb_4bit_quant_type is not passed here (unlike bnb_config), so
# the library default applies -- confirm that this difference is intended.
quant_config = DiffusersBitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
transformer = CosmosTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

# Assemble the pipeline around the pre-loaded quantized components.
# The component keyword is `transformer`; the previous `transformers=` spelling
# matched no pipeline component, so the 4-bit transformer above was not used.
pipe = CosmosTextToWorldPipeline.from_pretrained(
    model_id,
    text_encoder=text_encoder,
    transformer=transformer,
    torch_dtype=torch.bfloat16,  # optional
).to("cuda")

# Alternatives left disabled by the original author:
#pipe.to("cuda")
#pipe.enable_model_cpu_offload()
#pipe.enable_vae_slicing()

prompt = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."

# Generate a single 960x704 clip and take the first (only) video of the batch.
output = pipe(prompt=prompt, width=960, height=704).frames[0]
export_to_video(output, "output.mp4", fps=30)