4ec0a23e73
Change-Id: Ic4e43992e1674946cb69e0221659b0261259196c
57 lines
2.1 KiB
Bash
57 lines
2.1 KiB
Bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Load the Environment Modules initialisation script, which provides the
# `module` command used below. Abort if it is unavailable, since nothing
# after this point can work without it.
source /etc/profile.d/modules.sh || {
  echo "ERROR: failed to source /etc/profile.d/modules.sh" >&2
  exit 1
}

# Modules required by the job, loaded in this order:
# GCC 11.2.0, OpenMPI 4.1.3, CUDA 11.5.2, cuDNN 8.3.3, NCCL 2.11.4-1,
# Python 3.10.4.
required_modules=(
  gcc/11.2.0
  openmpi/4.1.3
  cuda/11.5/11.5.2
  cudnn/8.3/8.3.3
  nccl/2.11/2.11.4-1
  python/3.10/3.10.4
)

for module_name in "${required_modules[@]}"; do
  # Abort on the first module that cannot be loaded instead of continuing
  # with an incomplete toolchain.
  module load "${module_name}" || {
    echo "ERROR: failed to load module ${module_name}" >&2
    exit 1
  }
done

# Activate the virtual environment for PyTorch and Horovod
source ~/venv/pytorch1.11+horovod/bin/activate || {
  echo "ERROR: failed to activate ~/venv/pytorch1.11+horovod" >&2
  exit 1
}

# Set up the log directory and clean previous records if they exist.
# JOB_NAME and JOB_ID are read from the environment (presumably exported by
# the job scheduler). ${VAR:?} aborts the script when a variable is unset or
# empty, so the rm -rf below can never run against a truncated path.
LOG_OUTPUT="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME:?JOB_NAME is not set}_${JOB_ID:?JOB_ID is not set}"
rm -rf -- "${LOG_OUTPUT}"    # Remove previous log files
mkdir -p -- "${LOG_OUTPUT}" || {    # Create a new directory for logs
  echo "ERROR: cannot create log directory ${LOG_OUTPUT}" >&2
  exit 1
}

# Prepare local storage for the dataset by copying it to a local directory.
# SGE_LOCALDIR is guarded as well: if it were empty the copy target would
# collapse to a path under the filesystem root.
LOCAL_DATA_DIR="${SGE_LOCALDIR:?SGE_LOCALDIR is not set}/${JOB_ID}/"
# NOTE(review): the target directory is not created beforehand. Presumably it
# does not exist yet, in which case cp creates it as the copy of data/; if it
# already exists, data/ is nested inside it instead. Confirm which layout
# run_gkt.py expects.
cp -r -- ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_DIR}" || {
  echo "ERROR: failed to copy dataset to ${LOCAL_DATA_DIR}" >&2
  exit 1
}

# Navigate to the EdgeFLite project directory (relative to the current
# working directory). Abort if it is missing so that run_gkt.py is never
# looked up in the wrong directory.
cd EdgeFLite || {
  echo "ERROR: cannot change directory to EdgeFLite" >&2
  exit 1
}

# Run the federated learning experiment with the specified parameters.
#
# The arguments are collected in an array so that each one can carry its own
# comment. A trailing backslash followed by " # comment" escapes the space
# rather than the newline, which would end the command after the first flag;
# inside an array literal, comments are allowed on every line.
train_args=(
  --is_fed=1                          # Enable federated learning
  --fixed_cluster=0                   # Disable fixed cluster settings
  --split_factor=1                    # Use split factor of 1
  --num_clusters=20                   # Set the number of clusters to 20
  --num_selected=20                   # Select 20 clients for each round
  --arch=resnet_model_110sl           # Model architecture identifier
  --dataset=cifar100                  # Use CIFAR-100 dataset
  --num_classes=100                   # Set the number of classes to 100
  --is_single_branch=0                # Single-branch mode switched off
  --is_amp=0                          # Disable automatic mixed precision
  --num_rounds=650                    # Number of communication rounds
  --fed_epochs=1                      # Number of federated epochs
  --cifar100_non_iid="quantity_skew"  # Non-IID partitioning (quantity skew)
  --spid="FGKT_R110_20c_skew"         # Experiment ID
  --data="${LOCAL_DATA_DIR}"          # Dataset path in local storage
)

# Last command of the script: its exit status becomes the job's exit status.
python run_gkt.py "${train_args[@]}"