#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
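
# Optional hardening (an addition, not part of the original job setup):
# abort on the first failed command or broken pipeline. "-u" is deliberately
# omitted, since module initialization scripts often reference unset variables.
set -eo pipefail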

# Initialize the environment and load the required modules
# (GCC, OpenMPI, CUDA, cuDNN, NCCL, and Python).
source /etc/profile.d/modules.sh
|
|
module load gcc/11.2.0 # Load GCC version 11.2.0 for compiling
|
|
module load openmpi/4.1.3 # Load OpenMPI version 4.1.3 for distributed computing
|
|
module load cuda/11.5/11.5.2 # Load CUDA version 11.5.2 for GPU computing
|
|
module load cudnn/8.3/8.3.3 # Load cuDNN version 8.3.3 for deep learning frameworks
|
|
module load nccl/2.11/2.11.4-1 # Load NCCL version 2.11.4-1 for multi-GPU communication
|
|
module load python/3.10/3.10.4 # Load Python version 3.10.4
|
|
|
|
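
# Sanity check (an added sketch, assuming each module above puts the matching
# tool on PATH): fail early if the toolchain did not load.
for tool in gcc mpirun nvcc python; do
    command -v "${tool}" >/dev/null 2>&1 || { echo "${tool} not found; module load failed?" >&2; exit 1; }
done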

# Activate the pre-configured Python virtual environment in which the
# required packages (e.g., PyTorch, Horovod) are installed.
source ~/venv/pytorch1.11+horovod/bin/activate
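
# Quick import check (an added sketch; package names are inferred from the
# venv path above): confirm the environment provides PyTorch and Horovod
# before launching a long training run.
python -c "import torch, horovod.torch" || { echo "venv is missing PyTorch or Horovod" >&2; exit 1; }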

# Prepare the log directory for this job run and clean up any old records.
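
# Added guard (not in the original script): JOB_NAME and JOB_ID are provided
# by the SGE scheduler; abort if either is empty so the log path below is
# never built from blank values when the script is run interactively.
: "${JOB_NAME:?JOB_NAME is not set}" "${JOB_ID:?JOB_ID is not set}"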
LOG_DIRECTORY="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${LOG_DIRECTORY}"    # Remove old logs if they exist
mkdir -p "${LOG_DIRECTORY}"  # Create a fresh directory for the current job's logs

# Stage the dataset on node-local storage for training.
DATA_STORAGE="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_STORAGE}"  # Copy the CIFAR-100 dataset to local storage
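
# Added check (a sketch): make sure the staged dataset is actually present
# before training. The exact layout under ${DATA_STORAGE} depends on how cp
# resolved the target, so only require the staging directory to be non-empty.
[ -d "${DATA_STORAGE}" ] && [ -n "$(ls -A "${DATA_STORAGE}")" ] || { echo "dataset staging failed" >&2; exit 1; }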

# Move into the EdgeFLite project directory to run the training script;
# abort if the directory is missing.
cd EdgeFLite || exit 1

# Run the federated training. Hyperparameters:
#   --is_fed=1             enable federated learning mode
#   --fixed_cluster=0      disable fixed clusters, allowing dynamic regrouping
#   --split_factor=16      data split factor of 16
#   --num_clusters=6       6 clusters in the federated learning process
#   --num_selected=6       6 clients selected for each training round
#   --arch                 Wide ResNet architecture with depth 16 and width 8
#   --dataset              train on CIFAR-100
#   --num_classes=100      CIFAR-100 has 100 output classes
#   --is_single_branch=0   multi-branch (multi-head) learning
#   --is_amp=0             automatic mixed precision disabled
#   --num_rounds=650       650 communication rounds
#   --fed_epochs=1         1 local epoch per client per round
#   --spid                 unique identifier for this run
#   --data                 location of the staged dataset
python train_EdgeFLite.py \
    --is_fed=1 \
    --fixed_cluster=0 \
    --split_factor=16 \
    --num_clusters=6 \
    --num_selected=6 \
    --arch="wide_resnetsl16_8" \
    --dataset="cifar100" \
    --num_classes=100 \
    --is_single_branch=0 \
    --is_amp=0 \
    --num_rounds=650 \
    --fed_epochs=1 \
    --spid="EdgeFLite_W168_96c_650r" \
    --data="${DATA_STORAGE}"
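
# Note: ${LOG_DIRECTORY} is created above but nothing is written to it here.
# If the training output should be captured there as well, one option (with a
# hypothetical file name) is:
#   python train_EdgeFLite.py ... 2>&1 | tee "${LOG_DIRECTORY}/train.log"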