#!/bin/bash
# use-case-and-architecture/EdgeFLite/scripts/EdgeFLite_W168_96c_650r8.sh
# Weisen Pan 4ec0a23e73 — Edge Federated Learning for Improved Training Efficiency
# Change-Id: Ic4e43992e1674946cb69e0221659b0261259196c
# 2024-09-18 18:39:43 -07:00
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# Load the environment-modules system, pull in the toolchain/library stack
# required by the training job, then activate the Python virtual environment.
source /etc/profile.d/modules.sh
# Toolchain and CUDA/cuDNN/NCCL versions this job was built against.
for mod in \
    gcc/11.2.0 \
    openmpi/4.1.3 \
    cuda/11.5/11.5.2 \
    cudnn/8.3/8.3.3 \
    nccl/2.11/2.11.4-1 \
    python/3.10/3.10.4; do
  module load "$mod"
done
# Virtual environment providing PyTorch 1.11 + Horovod.
source ~/venv/pytorch1.11+horovod/bin/activate
# Per-job log directory: wiped and recreated so each run starts clean.
# JOB_NAME and JOB_ID are expected from the batch scheduler; the ":?" guards
# abort the script instead of letting an empty value turn "rm -rf" into a
# delete of the shared records directory.
LOG_PATH="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME:?JOB_NAME is not set}_${JOB_ID:?JOB_ID is not set}"
rm -rf -- "${LOG_PATH}"   # Remove any existing log directory for this job
mkdir -p -- "${LOG_PATH}" # Recreate it empty
# Stage the dataset on node-local scratch storage for faster I/O during training.
# SGE_LOCALDIR is provided by the scheduler; guard against it being unset.
DATA_STORAGE="${SGE_LOCALDIR:?SGE_LOCALDIR is not set}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_STORAGE}"
# Move into the project directory containing the training scripts; abort if
# it is missing rather than launching the job from the wrong directory.
cd EdgeFLite || exit 1
# Execute the training script with federated learning parameters.
#
# NOTE: flag explanations live in this comment block on purpose. A "#" comment
# after the "\" line-continuation character breaks the continuation — the
# original form truncated the command after its first flag and then tried to
# execute every remaining "--flag \" line as a separate command.
#
#   --is_fed=1                         enable federated learning
#   --fixed_cluster=0                  allow dynamic cluster formation
#   --split_factor=1                   data split factor
#   --num_clusters=20                  number of clusters
#   --num_selected=20                  clients selected per round
#   --arch="wide_resnet16_8"           Wide ResNet 16-8 backbone
#   --dataset="cifar10"                train on CIFAR-10
#                                      NOTE(review): the staging step copies the
#                                      cifar100 data directory — confirm the
#                                      dataset/path pairing is intended.
#   --num_classes=10                   CIFAR-10 class count
#   --is_single_branch=0               multi-branch network
#   --is_amp=0                         disable Automatic Mixed Precision
#   --num_rounds=300                   federated learning rounds
#   --fed_epochs=1                     local epochs per round
#   --cifar10_non_iid="quantity_skew"  non-IID distribution: quantity skew
#   --spid="FGKT_W168_20c_skew"        job identifier
#   --data                             path to the staged dataset
python run_gkt.py \
  --is_fed=1 \
  --fixed_cluster=0 \
  --split_factor=1 \
  --num_clusters=20 \
  --num_selected=20 \
  --arch="wide_resnet16_8" \
  --dataset="cifar10" \
  --num_classes=10 \
  --is_single_branch=0 \
  --is_amp=0 \
  --num_rounds=300 \
  --fed_epochs=1 \
  --cifar10_non_iid="quantity_skew" \
  --spid="FGKT_W168_20c_skew" \
  --data="${DATA_STORAGE}"