#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# EdgeFLite/scripts/EdgeFLite_R110_80c_650r.sh
# Federated training of a ResNet-110 variant on CIFAR-100 for 650 rounds with EdgeFLite.
# Load necessary environment modules
source /etc/profile.d/modules.sh # Source the module environment setup script
module load gcc/11.2.0 # Load GCC compiler version 11.2.0
module load openmpi/4.1.3 # Load OpenMPI version 4.1.3 for distributed computing
module load cuda/11.5/11.5.2 # Load CUDA version 11.5.2 for GPU acceleration
module load cudnn/8.3/8.3.3 # Load cuDNN version 8.3.3 for deep learning operations
module load nccl/2.11/2.11.4-1 # Load NCCL version 2.11 for multi-GPU communication
module load python/3.10/3.10.4 # Load Python version 3.10.4
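# Optional sanity check (an addition, not part of the original job script): confirm that the
# toolchain the modules are expected to provide is actually on PATH before continuing.
for tool in gcc mpirun nvcc python3; do
    command -v "${tool}" >/dev/null 2>&1 || echo "Warning: ${tool} not found on PATH after module load"
done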
# Activate the Python virtual environment with PyTorch and Horovod installed
source ~/venv/pytorch1.11+horovod/bin/activate
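# Optional check (a sketch, assuming the venv provides PyTorch): print the PyTorch version and
# whether CUDA is visible, so a misconfigured GPU node fails fast instead of mid-training.
python -c "import torch; print('torch', torch.__version__, 'cuda available:', torch.cuda.is_available())" \
    || echo "Warning: could not import torch from the activated environment"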
# Set up the log directory for the experiment
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}" # Define the log path
rm -rf "${LOG_PATH}" # Remove any existing logs in the directory
mkdir -p "${LOG_PATH}" # Create the log directory
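# Optional bookkeeping (an addition): snapshot the loaded modules and environment into the log
# directory so each run's software stack can be reconstructed later.
module list > "${LOG_PATH}/modules.txt" 2>&1
env | sort > "${LOG_PATH}/environment.txt"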
# Set up the local dataset directory and copy the data into it
DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/" # Define the local directory for the dataset
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_PATH}" # Copy CIFAR-100 dataset to local storage
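# Optional guard (an addition): abort early if the dataset copy failed or landed empty, rather
# than letting the training job fail later with a less obvious error.
if [ -z "$(ls -A "${DATA_PATH}" 2>/dev/null)" ]; then
    echo "Error: dataset copy to ${DATA_PATH} appears to be empty" >&2
    exit 1
fi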
# Set experiment parameters for federated learning
OUTPUT_DIR="./EdgeFLite/models/coremodel/" # Directory where model checkpoints will be saved
FED_MODE=1 # Federated learning mode enabled
CLUSTER_FIXED=0 # Use dynamic clusters rather than a fixed cluster assignment
SPLIT_RATIO=4 # Split the dataset into 4 parts
TOTAL_CLUSTERS=20 # Number of clusters (e.g., number of different clients in federated learning)
SELECTED_CLIENTS=20 # Number of clients selected per round
MODEL_ARCH="resnet_model_110sl" # Model architecture to be used (EdgeFLite's ResNet-110 variant)
DATASET_NAME="cifar100" # Dataset being used (CIFAR-100)
NUM_CLASS_LABELS=100 # Number of class labels in the dataset (CIFAR-100 has 100 classes)
SINGLE_BRANCH=0 # Multi-branch model architecture (not single-branch)
AMP_MODE=0 # Disable Automatic Mixed Precision (AMP) for training
ROUNDS=650 # Total number of federated learning rounds
EPOCHS_PER_ROUND=1 # Number of local epochs per round of federated learning
EXP_ID="EdgeFLite_R110_80c_650r" # Experiment ID for tracking
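# Optional bookkeeping (an addition; the file name below is illustrative): record the run
# configuration alongside the logs so the experiment ID can be matched to its hyperparameters.
cat <<EOF | tee "${LOG_PATH}/config_${EXP_ID}.txt"
experiment=${EXP_ID} arch=${MODEL_ARCH} dataset=${DATASET_NAME}
clusters=${TOTAL_CLUSTERS} selected=${SELECTED_CLIENTS} split_factor=${SPLIT_RATIO}
rounds=${ROUNDS} epochs_per_round=${EPOCHS_PER_ROUND} amp=${AMP_MODE}
EOF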
# Navigate to the project directory
cd EdgeFLite || exit 1 # Change to the EdgeFLite project directory (abort if it is missing)
# Execute the training process for federated learning with the defined parameters
# (Flags are not commented inline because a "#" after "\" would break the line continuation;
# each flag is documented where its variable is defined above.)
python train_EdgeFLite.py \
    --is_fed=${FED_MODE} \
    --fixed_cluster=${CLUSTER_FIXED} \
    --split_factor=${SPLIT_RATIO} \
    --num_clusters=${TOTAL_CLUSTERS} \
    --num_selected=${SELECTED_CLIENTS} \
    --arch=${MODEL_ARCH} \
    --dataset=${DATASET_NAME} \
    --num_classes=${NUM_CLASS_LABELS} \
    --is_single_branch=${SINGLE_BRANCH} \
    --is_amp=${AMP_MODE} \
    --num_rounds=${ROUNDS} \
    --fed_epochs=${EPOCHS_PER_ROUND} \
    --spid=${EXP_ID} \
    --data=${DATA_PATH} \
    --model_dir=${OUTPUT_DIR}
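# Optional follow-up (an addition): surface the training exit status and note where checkpoints
# were written, so the batch-scheduler log makes the outcome of the run obvious.
STATUS=$?
echo "train_EdgeFLite.py exited with status ${STATUS}; checkpoints (if any) are under ${OUTPUT_DIR}"
exit ${STATUS}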