#!/usr/bin/env bash
# Launcher for easyfl/applications/mas/main.py: parses command-line options,
# fills in defaults, and starts the run in the background through `srun`.

# Directory that receives the `tee`-ed job log; -p also creates the parent `log`.
mkdir -p log/mas

# Timestamp of this invocation, formatted as YYYYmmdd_HHMMSS.
now=$(date +"%Y%m%d_%H%M%S")

# All paths live under the invoking user's directory on /mnt/lustre.
root_dir="/mnt/lustre/$(whoami)"
project_dir="${root_dir}/easyfl/applications/mas"
data_dir="${root_dir}/datasets/taskonomy_datasets"
client_file="${project_dir}/clients.txt"

# Append the current working directory to PYTHONPATH. `${pwd}` would be an
# unset variable, so the shell-maintained `${PWD}` is used; the `:+` form avoids
# a leading ':' when PYTHONPATH starts out empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}"
#######################################
# Parse the launcher's command-line options into global variables.
# Every option has the form `-flag value`; the value may be an empty string.
# Globals (written):
#   partition tasks arch local_epoch clients_per_round batch_size rounds lr
#   lr_type test_every save_model_every gpus run_count dist_port tag tag_step
#   what client_id agg_strategy pretrained pretrained_tasks decoder half
# Arguments:
#   The script's positional parameters ("$@").
# Outputs:
#   An error message on stderr for an unknown flag or a flag without a value.
# Returns:
#   Exits the script with status 1 on either error.
#######################################
parse_args() {
  local flag target
  while (( $# > 0 )); do
    flag=$1
    # Map the flag to the name of the variable that stores its value.
    case "${flag}" in
      -p) target=partition ;;
      -t) target=tasks ;;
      -a) target=arch ;;
      -e) target=local_epoch ;;
      -k) target=clients_per_round ;;
      -b) target=batch_size ;;
      -r) target=rounds ;;
      -lr) target=lr ;;
      -lrt) target=lr_type ;;
      -te) target=test_every ;;
      -se) target=save_model_every ;;
      -gpus) target=gpus ;;
      -count) target=run_count ;;
      -port) target=dist_port ;;
      -tag) target=tag ;;
      -tag_step) target=tag_step ;;
      -what) target=what ;;
      -client_id) target=client_id ;;
      -agg_strategy) target=agg_strategy ;;
      -pretrained) target=pretrained ;;
      -pt) target=pretrained_tasks ;;
      -decoder) target=decoder ;;
      -half) target=half ;;
      *)
        echo "Unknown parameter passed: ${flag}" >&2
        exit 1
        ;;
    esac
    # A trailing flag with nothing after it used to be accepted silently and
    # shifted past the end of the argument list; reject it instead.
    if (( $# < 2 )); then
      echo "Missing value for parameter: ${flag}" >&2
      exit 1
    fi
    # Assign by name so the value is stored verbatim, without eval.
    printf -v "${target}" '%s' "$2"
    shift 2
  done
}

parse_args "$@"
#######################################
# Give every launcher option that is unset or empty its default value.
# Globals (read and written):
#   partition tasks arch local_epoch clients_per_round batch_size lr lr_type
#   rounds test_every save_model_every gpus dist_port tag tag_step run_count
#   client_id agg_strategy pretrained pretrained_tasks decoder half
# Globals (written only):
#   use_pretrained - 'y' when `pretrained` was supplied, otherwise 'n'
# Arguments:
#   None.
#######################################
apply_defaults() {
  # `${var:=value}` assigns only when var is unset or empty, which is the same
  # condition the `-z` test expresses.
  : "${partition:=partition}"
  tasks="${tasks:-}"
  : "${arch:=xception}" # options: xception, resnet18
  : "${local_epoch:=5}"
  : "${clients_per_round:=5}"
  : "${batch_size:=64}"
  : "${lr:=0.1}"
  : "${lr_type:=poly}"
  : "${rounds:=100}"
  : "${test_every:=1}"
  : "${save_model_every:=1}"
  : "${gpus:=1}"
  : "${dist_port:=23344}"
  # Whether to use task affinity grouping (lookahead).
  : "${tag:=y}"
  # Lookahead step.
  : "${tag_step:=10}"
  : "${run_count:=0}"
  : "${client_id:=NA}"
  : "${agg_strategy:=FedAvg}"
  : "${pretrained_tasks:=sdnkt}"

  # Without a `pretrained` value all three pretrained-related settings collapse
  # to 'n', overriding whatever `pretrained_tasks` held.
  use_pretrained='y'
  if [[ -z "${pretrained}" ]]; then
    pretrained='n'
    use_pretrained='n'
    pretrained_tasks='n'
  fi

  : "${decoder:=y}"
  : "${half:=n}"
}

apply_defaults
#######################################
# Compose the job identifier from the run's settings.
# Globals (read):
#   tasks arch batch_size lr_type lr agg_strategy tag tag_step local_epoch
#   clients_per_round rounds test_every save_model_every use_pretrained
#   pretrained_tasks what run_count
# Outputs:
#   The job name on stdout, without a trailing newline.
#######################################
build_job_name() {
  printf '%s' "mas-${tasks}-${arch}-b${batch_size}-${lr_type}lr${lr}-${agg_strategy}-tag-${tag}-${tag_step}-e${local_epoch}-n${clients_per_round}-r${rounds}-te${test_every}-se${save_model_every}-pretrained-${use_pretrained}-${pretrained_tasks}-${what}-${run_count}"
}

# The job name doubles as the Slurm job name, the --task_id passed to main.py
# and the log file name.
job_name=$(build_job_name)
echo "${job_name}"

# Both halves of the command are built as arrays so each value stays a single
# argument even if it contains whitespace or glob characters.
srun_cmd=(
  srun -u "--partition=${partition}" "--job-name=${job_name}"
  "-n${gpus}" "--gres=gpu:${gpus}" "--ntasks-per-node=${gpus}"
)

# `${tasks:+...}` contributes no word at all when tasks is empty (matching an
# unquoted expansion) while keeping a non-empty value as one argument.
train_cmd=(
  python "${project_dir}/main.py" --data_dir "${data_dir}" --arch "${arch}"
  --client_file "${client_file}"
  --task_id "${job_name}" --tasks ${tasks:+"${tasks}"} --rotate_loss
  --batch_size "${batch_size}" --lr "${lr}" --lr_type "${lr_type}"
  --local_epoch "${local_epoch}" --clients_per_round "${clients_per_round}"
  --rounds "${rounds}"
  --test_every "${test_every}" --save_model_every "${save_model_every}"
  --random_selection --lookahead "${tag}" --lookahead_step "${tag_step}"
  --dist_port "${dist_port}" --run_count "${run_count}"
  --load_decoder "${decoder}" --half "${half}"
  --aggregation_strategy "${agg_strategy}" --pretrained "${pretrained}"
  --pretrained_tasks "${pretrained_tasks}"
  --client_id "${client_id}"
)

# Run in the background; stdout and stderr are merged, shown on the terminal
# and copied to the per-job log file.
"${srun_cmd[@]}" "${train_cmd[@]}" 2>&1 | tee "log/mas/${job_name}.log" &