#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
# export WANDB_MODE="offline"
export WANDB_MODE="disabled"
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="DEBUG"

BACKEND="ptd"
NUM_GPUS=4
CUDA_VISIBLE_DEVICES="0,1,2,3"

# Check the JSON files for the expected JSON format
DATASET_FILE="examples/inference/flux/dummy_text_to_image.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_8="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 8 --dp_shards 1 --cp_degree 1 --tp_degree 1"
CP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 2 --tp_degree 1"
CP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 4 --tp_degree 1"
# FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
# FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
# HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
  $CP_4
)

# Model arguments
model_cmd=(
  --model_name "flux"
  --pretrained_model_name_or_path "black-forest-labs/FLUX.1-dev"
  --cache_dir /raid/.cache/huggingface
  --enable_slicing
  --enable_tiling
)

# Inference arguments
inference_cmd=(
  --inference_type text_to_image
  --dataset_file "$DATASET_FILE"
)

# Attention provider arguments
attn_provider_cmd=(
  --attn_provider flash_varlen
)

# Torch config arguments
torch_config_cmd=(
  --allow_tf32
  --float32_matmul_precision high
)

# Miscellaneous arguments
miscellaneous_cmd=(
  --seed 31337
  --tracker_name "finetrainers-inference"
  --output_dir "/raid/aryan/flux-inference"
  --init_timeout 600
  --nccl_timeout 600
  --report_to "wandb"
)

# Execute the inference script
export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES

torchrun \
  --standalone \
  --nnodes=1 \
  --nproc_per_node=$NUM_GPUS \
  --rdzv_backend c10d \
  --rdzv_endpoint="localhost:19242" \
  examples/inference/inference.py \
  "${parallel_cmd[@]}" \
  "${model_cmd[@]}" \
  "${inference_cmd[@]}" \
  "${attn_provider_cmd[@]}" \
  "${torch_config_cmd[@]}" \
  "${miscellaneous_cmd[@]}"

echo -ne "-------------------- Finished executing script --------------------\n\n"
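
# ---------------------------------------------------------------------------
# Note on DATASET_FILE: the authoritative schema is whatever the example JSON
# files shipped with this script use -- check those first, as the comment
# above DATASET_FILE says. As a rough, unverified sketch, a text_to_image
# entry presumably carries at least the prompt and generation settings; the
# field names below are illustrative guesses, not a confirmed finetrainers
# format:
#
# [
#   {
#     "prompt": "A cat holding a sign that says hello world",
#     "height": 1024,
#     "width": 1024
#   }
# ]
# ---------------------------------------------------------------------------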
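
# ---------------------------------------------------------------------------
# Optional sanity check (a minimal sketch, not part of the original script):
# with device-mesh-based parallelism, the degrees are expected to multiply to
# the world size, i.e. pp_degree * dp_degree * dp_shards * cp_degree *
# tp_degree == NUM_GPUS (e.g. CP_4 above gives 1*1*1*4*1 = 4). If you swap
# parallel_cmd to another preset, uncommenting a check like this *before* the
# torchrun call catches mismatches early. `degree_product` and `prev_flag`
# are illustrative helper names, not finetrainers options.
#
# degree_product=1
# prev_flag=""
# for arg in "${parallel_cmd[@]}"; do
#     case "$prev_flag" in
#         --pp_degree|--dp_degree|--dp_shards|--cp_degree|--tp_degree)
#             # Previous element was a degree flag, so this element is its value.
#             degree_product=$((degree_product * arg))
#             ;;
#     esac
#     prev_flag="$arg"
# done
# if [ "$degree_product" -ne "$NUM_GPUS" ]; then
#     echo "Parallel degrees multiply to $degree_product, but NUM_GPUS=$NUM_GPUS" >&2
#     exit 1
# fi
# ---------------------------------------------------------------------------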