#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# Load necessary environment modules
source /etc/profile.d/modules.sh  # Source the module environment setup script
module load gcc/11.2.0            # Load GCC compiler version 11.2.0
module load openmpi/4.1.3         # Load OpenMPI version 4.1.3 for distributed computing
module load cuda/11.5/11.5.2      # Load CUDA version 11.5.2 for GPU acceleration
module load cudnn/8.3/8.3.3       # Load cuDNN version 8.3.3 for deep learning operations
module load nccl/2.11/2.11.4-1    # Load NCCL version 2.11.4 for multi-GPU communication
module load python/3.10/3.10.4    # Load Python version 3.10.4
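# Optional sanity check (not in the original script): print the modules that are
# actually loaded, so a failed or renamed module is visible in the job output.
module list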
# Activate the Python virtual environment with PyTorch and Horovod installed
source ~/venv/pytorch1.11+horovod/bin/activate
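# Optional sanity check (assumption, not part of the original script): fail fast
# if the virtual environment does not actually provide PyTorch and Horovod.
python -c "import torch, horovod.torch; print(torch.__version__)" || exit 1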
# Set up the log directory for the experiment
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"  # Define the log path
rm -rf "${LOG_PATH}"    # Remove any existing logs in the directory
mkdir -p "${LOG_PATH}"  # Create the log directory if it doesn't exist
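# Note (suggestion, not in the original script): JOB_NAME and JOB_ID are expected
# to be set by the scheduler. A guarded form such as
#   LOG_PATH=".../records/${JOB_NAME:?}_${JOB_ID:?}"
# would abort the script instead of running `rm -rf` on an unintended path if
# either variable happened to be unset.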
# Set up the dataset directory, copying data for local use
DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"  # Define the local directory for the dataset
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}"  # Copy CIFAR-100 dataset to local storage
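# Optional sanity check (assumption, not part of the original script): make sure
# the dataset actually arrived on node-local storage before training starts.
[ -n "$(ls -A "${DATA_PATH}" 2>/dev/null)" ] || { echo "CIFAR-100 copy to ${DATA_PATH} failed" >&2; exit 1; }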
# Set experiment parameters for federated learning
OUTPUT_DIR="./EdgeFLite/models/coremodel/"  # Directory where model checkpoints will be saved
FED_MODE=1                        # Federated learning mode enabled
CLUSTER_FIXED=0                   # Clusters are dynamic, not fixed
SPLIT_RATIO=4                     # Split the dataset into 4 parts
TOTAL_CLUSTERS=20                 # Number of clusters (i.e., groups of clients in federated learning)
SELECTED_CLIENTS=20               # Number of clients selected per round
MODEL_ARCH="resnet_model_110sl"   # Model architecture to be used (a ResNet-110 variant)
DATASET_NAME="cifar100"           # Dataset being used (CIFAR-100)
NUM_CLASS_LABELS=100              # Number of class labels in the dataset (CIFAR-100 has 100 classes)
SINGLE_BRANCH=0                   # Multi-branch model architecture (not single-branch)
AMP_MODE=0                        # Disable Automatic Mixed Precision (AMP) for training
ROUNDS=650                        # Total number of federated learning rounds
EPOCHS_PER_ROUND=1                # Number of local epochs per round of federated learning
EXP_ID="EdgeFLite_R110_80c_650r"  # Experiment ID for tracking
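# Note (inference, not stated in the original script): the "80c" in EXP_ID
# presumably reflects TOTAL_CLUSTERS * SPLIT_RATIO = 20 * 4 = 80 client
# sub-models, and "650r" matches ROUNDS=650.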
# Navigate to the project directory
cd EdgeFLite || exit 1  # Change to the EdgeFLite project directory; abort if it is missing
# Execute the training process for federated learning with the defined parameters.
# The options are collected in an array so that each one keeps its inline comment
# without breaking the command's line continuation.
TRAIN_ARGS=(
    --is_fed=${FED_MODE}                 # Enable federated learning mode
    --fixed_cluster=${CLUSTER_FIXED}     # Use dynamic clusters
    --split_factor=${SPLIT_RATIO}        # Set the dataset split ratio
    --num_clusters=${TOTAL_CLUSTERS}     # Total number of clusters (clients)
    --num_selected=${SELECTED_CLIENTS}   # Number of clients selected per federated round
    --arch=${MODEL_ARCH}                 # Set model architecture (ResNet-110 variant)
    --dataset=${DATASET_NAME}            # Dataset name (CIFAR-100)
    --num_classes=${NUM_CLASS_LABELS}    # Number of classes in the dataset
    --is_single_branch=${SINGLE_BRANCH}  # Use multi-branch model (set to 0)
    --is_amp=${AMP_MODE}                 # Disable automatic mixed precision
    --num_rounds=${ROUNDS}               # Total number of rounds for federated learning
    --fed_epochs=${EPOCHS_PER_ROUND}     # Number of local epochs per round
    --spid=${EXP_ID}                     # Set experiment ID for tracking
    --data=${DATA_PATH}                  # Provide dataset path
    --model_dir=${OUTPUT_DIR}            # Directory where the model will be saved
)
python train_EdgeFLite.py "${TRAIN_ARGS[@]}"
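# Optional (suggestion, not in the original script): LOG_PATH is created above but
# never referenced again. If the intent is to keep the training output with the
# experiment records, the command above could instead be run as, e.g.:
#   python train_EdgeFLite.py "${TRAIN_ARGS[@]}" 2>&1 | tee "${LOG_PATH}/train_${EXP_ID}.log"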