Spaces:
No application file
No application file
File size: 2,759 Bytes
b26e93d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
#!/bin/bash
# Show a numbered menu of model sizes (s, m, l, x) and keep prompting until
# a valid entry is picked.
# Globals: MODEL_SIZE (written) - the chosen size letter.
# Input:   menu choice read from stdin; the menu itself goes to stderr.
select_model_size() {
  local -a size_options=(s m l x)

  echo "Select model size:"
  select size in "${size_options[@]}"; do
    # `select` leaves the variable empty for any out-of-range reply.
    if [[ -z "$size" ]]; then
      echo "Invalid selection. Please try again."
      continue
    fi

    echo "You selected model size: $size"
    MODEL_SIZE=$size
    break
  done
}
# Show a numbered menu of tasks (obj365, obj2coco, coco) and keep prompting
# until a valid entry is picked.
# Globals: TASK (written) - the chosen task name.
# Input:   menu choice read from stdin; the menu itself goes to stderr.
select_task() {
  local -a task_options=(obj365 obj2coco coco)

  echo "Select task:"
  select task in "${task_options[@]}"; do
    # `select` leaves the variable empty for any out-of-range reply.
    if [[ -z "$task" ]]; then
      echo "Invalid selection. Please try again."
      continue
    fi

    echo "You selected task: $task"
    TASK=$task
    break
  done
}
# Ask whether the training output should be saved to a txt file.
# Globals: SAVE_LOGS (written) - "true" for an answer starting with y/Y,
#          "false" for an answer starting with n/N.
# Input:   answer read from stdin; any other answer re-prompts.
# Returns: 0 once a yes/no answer was read; 1 if stdin reaches end-of-file
#          first, in which case SAVE_LOGS is left untouched.
ask_save_logs() {
  local yn
  while true; do
    # A failed read with nothing captured means end-of-file; without this
    # check the loop would spin forever on a closed or exhausted stdin.
    # -r keeps backslashes in the answer literal.
    if ! read -r -p "Do you want to save logs to a txt file? (y/n): " yn \
      && [[ -z "$yn" ]]; then
      echo "No answer received (end of input)." >&2
      return 1
    fi
    case "$yn" in
      [Yy]*)
        SAVE_LOGS=true
        break
        ;;
      [Nn]*)
        SAVE_LOGS=false
        break
        ;;
      *)
        echo "Please answer yes or no."
        ;;
    esac
  done
}
# Main flow: collect the user's choices, derive the config file and output
# directory, run training, and on failure keep resuming from the last
# checkpoint until a run succeeds.
#
# Training runs in the FOREGROUND. Launching it with a trailing `&` makes
# `$?` report the status of starting the background job (always 0), so a
# failed run would never be noticed and the resume logic would never fire.
# To detach, start this script itself with nohup / `&`.

# Call the functions to let the user select
select_model_size
select_task
ask_save_logs

# Stop here if a choice is missing (e.g. stdin closed before an answer was
# given) rather than build paths with empty components.
: "${MODEL_SIZE:?no model size selected}"
: "${TASK:?no task selected}"
# A missing logging answer is treated as "do not save logs".
SAVE_LOGS=${SAVE_LOGS:-false}

# Set config file and output directory based on selection
if [[ "$TASK" == "coco" ]]; then
  CONFIG_FILE="configs/dfine/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml"
else
  CONFIG_FILE="configs/dfine/objects365/dfine_hgnetv2_${MODEL_SIZE}_${TASK}.yml"
fi
OUTPUT_DIR="output/${MODEL_SIZE}_${TASK}"
CHECKPOINT="${OUTPUT_DIR}/last.pth"

# Construct the training command as an argv array so no eval or re-parsing
# of a concatenated string is needed.
TRAIN_CMD=(
  torchrun --master_port=7777 --nproc_per_node=4 train.py
  -c "$CONFIG_FILE" --use-amp --seed=0 --output-dir "$OUTPUT_DIR"
)

# Run one training command on GPUs 0-3 and return its exit status.
# Arguments: $1 - log file (used only when SAVE_LOGS is "true")
#            $2... - the command and its arguments
run_training() {
  local log_file=$1
  shift
  if [[ "$SAVE_LOGS" == true ]]; then
    CUDA_VISIBLE_DEVICES=0,1,2,3 "$@" > "$log_file" 2>&1
  else
    CUDA_VISIBLE_DEVICES=0,1,2,3 "$@"
  fi
}

# Run the training command
if ! run_training "${MODEL_SIZE}_${TASK}.txt" "${TRAIN_CMD[@]}"; then
  echo "First training failed, restarting with resume option..."
  # Resume logs are numbered from 2 (the first run is attempt 1) so a later
  # attempt never overwrites the log of an earlier one.
  attempt=2
  while true; do
    # Without a checkpoint every resume attempt would fail the same way and
    # this loop would never end.
    if [[ ! -e "$CHECKPOINT" ]]; then
      echo "Cannot resume: checkpoint ${CHECKPOINT} not found." >&2
      exit 1
    fi
    if run_training "${MODEL_SIZE}_${TASK}_${attempt}.txt" \
      "${TRAIN_CMD[@]}" -r "$CHECKPOINT"; then
      break
    fi
    attempt=$((attempt + 1))
  done
fi
|