Commit 2ad9d00 · committed by QinOwen
1 Parent(s): 5098655

fix-bug
VADER-VideoCrafter/scripts/main/train_t2v_lora.py
CHANGED
@@ -29,7 +29,6 @@ from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
import hpsv2
import bitsandbytes as bnb
from accelerate import Accelerator
-from accelerate.logging import get_logger
from accelerate.utils import gather_object
import torch.distributed as dist
import logging

@@ -43,16 +42,6 @@ import cv2
# st = ipdb.set_trace


-logger = get_logger(__name__, log_level="INFO") # get logger for current module
-
-def create_logging(logging, logger, accelerator):
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state, main_process_only=False)
-
def create_output_folders(output_dir, run_name):
    out_dir = os.path.join(output_dir, run_name)
    os.makedirs(out_dir, exist_ok=True)

@@ -567,12 +556,162 @@ def should_sample(global_step, validation_steps, is_sample_preview):
        and is_sample_preview


+# def run_training(args, model, **kwargs):
+#     ## ---------------------step 1: setup---------------------------
+#     output_dir = args.project_dir
+
+
+#     # step 2.1: add LoRA using peft
+#     config = peft.LoraConfig(
+#             r=args.lora_rank,
+#             target_modules=["to_k", "to_v", "to_q"], # only diffusion_model has these modules
+#             lora_dropout=0.01,
+#         )
+#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+#     model = model.to(device)
+#     peft_model = peft.get_peft_model(model, config)
+
+
+
+#     # load the pretrained LoRA model
+#     if args.lora_ckpt_path != "Base Model":
+#         if args.lora_ckpt_path == "huggingface-hps-aesthetic": # download the pretrained LoRA model from huggingface
+#             snapshot_download(repo_id='zheyangqin/VADER', local_dir ='VADER-VideoCrafter/checkpoints/pretrained_lora')
+#             args.lora_ckpt_path = 'VADER-VideoCrafter/checkpoints/pretrained_lora/vader_videocrafter_hps_aesthetic.pt'
+#         elif args.lora_ckpt_path == "huggingface-pickscore": # download the pretrained LoRA model from huggingface
+#             snapshot_download(repo_id='zheyangqin/VADER', local_dir ='VADER-VideoCrafter/checkpoints/pretrained_lora')
+#             args.lora_ckpt_path = 'VADER-VideoCrafter/checkpoints/pretrained_lora/vader_videocrafter_pickscore.pt'
+#         # load the pretrained LoRA model
+#         peft.set_peft_model_state_dict(peft_model, torch.load(args.lora_ckpt_path))
+
+
+#     # peft_model.first_stage_model.to(device)
+
+#     peft_model.eval()
+
+#     print("device is: ", device)
+#     print("precision: ", peft_model.dtype)
+#     # precision of first_stage_model
+#     print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+#     print("peft_model device: ", peft_model.device)
+
+#     # Inference Step: only do inference and save the videos. Skip this step if it is training
+#     # ==================================================================
+#     # sample shape
+#     assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
+#     # latent noise shape
+#     h, w = args.height // 8, args.width // 8
+
+#     frames = peft_model.temporal_length if args.frames < 0 else args.frames
+#     channels = peft_model.channels
+
+#     ## Inference step 2: run Inference over samples
+#     print("***** Running inference *****")
+
+
+#     ## Inference Step 3: generate new validation videos
+#     with torch.no_grad():
+
+#         # set random seed for each process
+#         random.seed(args.seed)
+#         torch.manual_seed(args.seed)
+
+#         prompts_all = [args.prompt_str]
+#         val_prompt = list(prompts_all)
+
+#         assert len(val_prompt) == 1, "Error: only one prompt is allowed for inference in gradio!"
+
+#         # store output of generations in dict
+#         results=dict(filenames=[],dir_name=[], prompt=[])
+
+#         # Inference Step 3.1: forward pass
+#         batch_size = len(val_prompt)
+#         noise_shape = [batch_size, channels, frames, h, w]
+
+#         fps = torch.tensor([args.fps]*batch_size).to(device).long()
+
+#         prompts = val_prompt
+#         if isinstance(prompts, str):
+#             prompts = [prompts]
+
+#         # mix precision
+
+#         if isinstance(peft_model, torch.nn.parallel.DistributedDataParallel):
+#             text_emb = peft_model.module.get_learned_conditioning(prompts).to(device)
+#         else:
+#             text_emb = peft_model.get_learned_conditioning(prompts).to(device)
+
+#         if args.mode == 'base':
+#             cond = {"c_crossattn": [text_emb], "fps": fps}
+#         else: # TODO: implement i2v mode training in the future
+#             raise NotImplementedError
+
+#         # Inference Step 3.2: inference, batch_samples shape: batch, <samples>, c, t, h, w
+#         # no backprop_mode=args.backprop_mode because it is inference process
+#         batch_samples = batch_ddim_sampling(peft_model, cond, noise_shape, args.n_samples, \
+#                                             args.ddim_steps, args.ddim_eta, args.unconditional_guidance_scale, None, decode_frame=args.decode_frame, **kwargs)
+
+#         print("batch_samples dtype: ", batch_samples.dtype)
+#         print("batch_samples device: ", batch_samples.device)
+#         # batch_samples: b,samples,c,t,h,w
+#         dir_name = os.path.join(output_dir, "samples")
+#         # filenames should be related to the gpu index
+#         # get timestamps for filenames to avoid overwriting
+#         # current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+#         filenames = [f"temporal"] # only one sample
+#         # if dir_name is not exists, create it
+#         os.makedirs(dir_name, exist_ok=True)
+
+#         save_videos(batch_samples, dir_name, filenames, fps=args.savefps)
+
+#         results["filenames"].extend(filenames)
+#         results["dir_name"].extend([dir_name]*len(filenames))
+#         results["prompt"].extend(prompts)
+#         results=[ results ] # transform to list, otherwise gather_object() will not collect correctly
+
+#         # Inference Step 3.3: collect inference results and save the videos to wandb
+#         # collect inference results from all the GPUs
+#         results_gathered=gather_object(results)
+
+#         filenames = []
+#         dir_name = []
+#         prompts = []
+#         for i in range(len(results_gathered)):
+#             filenames.extend(results_gathered[i]["filenames"])
+#             dir_name.extend(results_gathered[i]["dir_name"])
+#             prompts.extend(results_gathered[i]["prompt"])
+
+#         print("Validation sample saved!")
+
+#         # # batch size is 1, so only one video is generated
+
+#         # video = get_videos(batch_samples)
+
+#         # # read the video from the saved path
+#         video_path = os.path.join(dir_name[0], filenames[0]+".mp4")
+
+
+
+#         # release memory
+#         del batch_samples
+#         torch.cuda.empty_cache()
+#         gc.collect()
+
+#         return video_path
+
+#     # end of inference only, training script continues
+#     # ==================================================================
+
def run_training(args, model, **kwargs):
    ## ---------------------step 1: accelerator setup---------------------------
    accelerator = Accelerator( # Initialize Accelerator
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
-        project_dir=args.project_dir
+        project_dir=args.project_dir,
+        device_placement=True,
+        cpu=False
    )
    output_dir = args.project_dir


@@ -584,6 +723,8 @@ def run_training(args, model, **kwargs):
        lora_dropout=0.01,
    )

+    model = model.to(accelerator.device)
+
    peft_model = peft.get_peft_model(model, config)

    peft_model.print_trainable_parameters()

@@ -599,13 +740,24 @@ def run_training(args, model, **kwargs):
        # load the pretrained LoRA model
        peft.set_peft_model_state_dict(peft_model, torch.load(args.lora_ckpt_path))

-
-
-
+
+    print("precision: ", peft_model.dtype)
+    # precision of first_stage_model
+    print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+    print("peft_model device: ", peft_model.device)
+
    # Inference Step: only do inference and save the videos. Skip this step if it is training
    # ==================================================================
    if args.inference_only:
        peft_model = accelerator.prepare(peft_model)
+
+
+        print("precision: ", peft_model.dtype)
+        # precision of first_stage_model
+        print("precision of first_stage_model: ", peft_model.first_stage_model.dtype)
+        print("peft_model device: ", peft_model.device)
+
+
        # sample shape
        assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
        # latent noise shape
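Taken together, the hunks above configure the Accelerator explicitly (project_dir, device_placement, cpu), move the base model onto accelerator.device before wrapping it with PEFT, and keep the existing path for loading pretrained VADER LoRA weights. Below is a minimal, self-contained sketch of that setup pattern; the args fields mirror the script's options, but the helper name and overall shape are illustrative, not the repository's exact code.

```python
# Minimal sketch of the setup pattern from the hunks above (illustrative only).
import torch
import peft
from accelerate import Accelerator
from huggingface_hub import snapshot_download


def setup_lora_model(args, model):
    # Explicit Accelerator configuration, as in the updated run_training().
    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,   # e.g. "fp16" or "bf16"
        project_dir=args.project_dir,
        device_placement=True,
        cpu=False,
    )

    # Attach LoRA adapters to the attention projections of the diffusion model.
    config = peft.LoraConfig(
        r=args.lora_rank,
        target_modules=["to_k", "to_v", "to_q"],
        lora_dropout=0.01,
    )
    model = model.to(accelerator.device)        # place the base model first
    peft_model = peft.get_peft_model(model, config)

    # Optionally pull pretrained VADER LoRA weights from the Hub, then load them.
    if args.lora_ckpt_path == "huggingface-hps-aesthetic":
        snapshot_download(repo_id="zheyangqin/VADER",
                          local_dir="VADER-VideoCrafter/checkpoints/pretrained_lora")
        args.lora_ckpt_path = ("VADER-VideoCrafter/checkpoints/pretrained_lora/"
                               "vader_videocrafter_hps_aesthetic.pt")
    if args.lora_ckpt_path != "Base Model":
        peft.set_peft_model_state_dict(peft_model, torch.load(args.lora_ckpt_path))

    return accelerator, peft_model
```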
@@ -618,7 +770,7 @@ def run_training(args, model, **kwargs):
        channels = peft_model.channels

        ## Inference step 2: run Inference over samples
-
+        print("***** Running inference *****")

        first_epoch = 0
        global_step = 0

@@ -627,10 +779,6 @@ def run_training(args, model, **kwargs):
        ## Inference Step 3: generate new validation videos
        with torch.no_grad():

-            # set random seed for each process
-            random.seed(args.seed)
-            torch.manual_seed(args.seed)
-
            prompts_all = [args.prompt_str]
            val_prompt = list(prompts_all)

@@ -670,6 +818,8 @@ def run_training(args, model, **kwargs):
            batch_samples = batch_ddim_sampling(peft_model, cond, noise_shape, args.n_samples, \
                                                args.ddim_steps, args.ddim_eta, args.unconditional_guidance_scale, None, decode_frame=args.decode_frame, **kwargs)

+            print("batch_samples dtype: ", batch_samples.dtype)
+            print("batch_samples device: ", batch_samples.device)
            # batch_samples: b,samples,c,t,h,w
            dir_name = os.path.join(output_dir, "samples")
            # filenames should be related to the gpu index

@@ -699,7 +849,7 @@ def run_training(args, model, **kwargs):
                dir_name.extend(results_gathered[i]["dir_name"])
                prompts.extend(results_gathered[i]["prompt"])

-
+            print("Validation sample saved!")

            # # batch size is 1, so only one video is generated

@@ -715,12 +865,9 @@ def run_training(args, model, **kwargs):
            torch.cuda.empty_cache()
            gc.collect()

-        return video_path
+        return video_path

-    # end of inference only, training script continues
-    # ==================================================================

-
def setup_model():
    parser = get_parser()
    args = parser.parse_args()

@@ -747,6 +894,7 @@ def setup_model():


    print("Model setup complete!")
+    print("model dtype: ", model.dtype)
    return model


@@ -777,3 +925,8 @@ def main_fn(prompt, lora_model, lora_rank, seed=200, height=320, width=512, unco

    return video_path

+# if main
+if __name__ == "__main__":
+    model = setup_model()
+
+    main_fn("a person walking on the street", "huggingface-hps-aesthetic", 16, 200, 320, 512, 12, 25, 1.0, 24, 10, model=model)
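Both the live inference path and the commented-out copy above save videos per process and then merge the per-GPU results with accelerate's gather_object. A minimal sketch of that collection step, using placeholder filenames and prompts rather than real outputs:

```python
# Minimal sketch of gathering per-process inference results (placeholder data).
from accelerate import Accelerator
from accelerate.utils import gather_object

accelerator = Accelerator()

# Each process wraps its own results in a one-element list so gather_object
# concatenates the dicts from all ranks instead of merging their fields.
results = [{"filenames": ["temporal"], "dir_name": ["samples"], "prompt": ["a person walking"]}]
results_gathered = gather_object(results)

filenames, dir_names, prompts = [], [], []
for r in results_gathered:
    filenames.extend(r["filenames"])
    dir_names.extend(r["dir_name"])
    prompts.extend(r["prompt"])

if accelerator.is_main_process:
    print(f"Collected {len(filenames)} videos from {accelerator.num_processes} process(es)")
```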
app.py
CHANGED

@@ -22,7 +22,7 @@ examples = [

model = setup_model()

-@spaces.GPU(duration=
+@spaces.GPU(duration=120)
def gradio_main_fn(prompt, lora_model, lora_rank, seed, height, width, unconditional_guidance_scale, ddim_steps, ddim_eta,
                   frames, savefps):
    global model

@@ -203,16 +203,16 @@ with gr.Blocks(css=custom_css) as demo:
            seed = gr.Slider(minimum=0, maximum=65536, label="Seed", step = 1, value=200)

            with gr.Row():
-                height = gr.Slider(minimum=0, maximum=
-                width = gr.Slider(minimum=0, maximum=
+                height = gr.Slider(minimum=0, maximum=512, label="Height", step = 16, value=384)
+                width = gr.Slider(minimum=0, maximum=512, label="Width", step = 16, value=512)

            with gr.Row():
                frames = gr.Slider(minimum=0, maximum=50, label="Frames", step = 1, value=24)
-                savefps = gr.Slider(minimum=0, maximum=
+                savefps = gr.Slider(minimum=0, maximum=30, label="Save FPS", step = 1, value=10)


            with gr.Row():
-                DDIM_Steps = gr.Slider(minimum=0, maximum=
+                DDIM_Steps = gr.Slider(minimum=0, maximum=50, label="DDIM Steps", step = 1, value=25)
                unconditional_guidance_scale = gr.Slider(minimum=0, maximum=50, label="Guidance Scale", step = 0.1, value=12.0)
                DDIM_Eta = gr.Slider(minimum=0, maximum=1, label="DDIM Eta", step = 0.01, value=1.0)

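On the app side, the handler now requests a ZeroGPU allocation for up to 120 seconds per call via @spaces.GPU(duration=120), and the height/width, save-FPS, and DDIM-steps sliders get explicit upper bounds and defaults. A minimal sketch of how a ZeroGPU-decorated handler and such sliders are wired together in a Gradio Blocks app; the layout and function body below are illustrative, not the Space's actual UI:

```python
# Minimal sketch of a ZeroGPU-decorated Gradio handler (illustrative layout).
import gradio as gr
import spaces


@spaces.GPU(duration=120)          # request a GPU for up to 120 s per call
def generate(prompt, height, width, ddim_steps):
    # ... run the video model here and return the path of the saved clip ...
    return "samples/temporal.mp4"


with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    height = gr.Slider(minimum=0, maximum=512, step=16, value=384, label="Height")
    width = gr.Slider(minimum=0, maximum=512, step=16, value=512, label="Width")
    ddim_steps = gr.Slider(minimum=0, maximum=50, step=1, value=25, label="DDIM Steps")
    video_out = gr.Video(label="Result")
    gr.Button("Generate").click(generate, inputs=[prompt, height, width, ddim_steps], outputs=video_out)

demo.launch()
```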
gradio_cached_examples/32/indices.csv
DELETED

@@ -1 +0,0 @@
-0

gradio_cached_examples/32/log.csv
DELETED

@@ -1,2 +0,0 @@
-component 0,flag,username,timestamp
-"{""video"": {""path"": ""gradio_cached_examples/32/component 0/fd156c6a458fa048724e/temporal.mp4"", ""url"": ""/file=/tmp/gradio/4bc133becbc469de8da700250f7f7df1103c6f56/temporal.mp4"", ""size"": null, ""orig_name"": ""temporal.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-19 00:00:10.509808

gradio_cached_examples/34/indices.csv
DELETED

@@ -1 +0,0 @@
-0

gradio_cached_examples/34/log.csv
DELETED

@@ -1,2 +0,0 @@
-component 0,flag,username,timestamp
-"{""video"": {""path"": ""gradio_cached_examples/34/component 0/d2ac1c9664e80f60d50f/temporal.mp4"", ""url"": ""/file=/tmp/gradio/4bc133becbc469de8da700250f7f7df1103c6f56/temporal.mp4"", ""size"": null, ""orig_name"": ""temporal.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-18 23:33:26.912888