# use-case-and-architecture/EdgeFLite/scripts/EdgeFLite_W168_96c_650r4.sh

# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Initialise the environment-modules system, then load the toolchain one
# module at a time (compiler, MPI, CUDA, cuDNN, NCCL, Python), in this order.
. /etc/profile.d/modules.sh
for required_module in \
    gcc/11.2.0 \
    openmpi/4.1.3 \
    cuda/11.5/11.5.2 \
    cudnn/8.3/8.3.3 \
    nccl/2.11/2.11.4-1 \
    python/3.10/3.10.4; do
  module load "${required_module}"
done

# Enter the Python virtual environment used for the run
. ~/venv/pytorch1.11+horovod/bin/activate

# Configure the per-job log directory and start from an empty one.
# JOB_NAME and JOB_ID are not assigned anywhere in this script; presumably
# they are exported by the job scheduler -- TODO confirm.
OUTPUT_LOG_DIR="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
# The path is quoted so a job name containing whitespace or glob characters
# cannot make rm/mkdir act on unintended paths; "--" ends option parsing.
rm -rf -- "${OUTPUT_LOG_DIR}"
mkdir -p -- "${OUTPUT_LOG_DIR}"

# Copy the dataset into the job-local directory for processing.
# ${VAR:?msg} aborts the script when the variable is unset or empty; without
# that guard the destination would collapse to "//" and cp would write into
# the filesystem root.
# NOTE(review): the source path is the cifar100 data directory, while the
# training command in this script passes --dataset="cifar10" -- confirm that
# this is the intended dataset location.
LOCAL_DATA_PATH="${SGE_LOCALDIR:?SGE_LOCALDIR must be set}/${JOB_ID:?JOB_ID must be set}/"
if ! cp -r -- ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_PATH}"; then
  # Stop here rather than starting a training run that has no data to read.
  printf 'error: failed to copy dataset to %s\n' "${LOCAL_DATA_PATH}" >&2
  exit 1
fi

# Switch to the working directory containing the training scripts. Abort if
# it is missing so python is never launched from the wrong directory.
cd EdgeFLite || {
  printf 'error: cannot change directory to EdgeFLite\n' >&2
  exit 1
}

# Arguments for the federated learning run.
# They are collected in an array because a backslash followed by spaces and
# an inline comment does NOT continue the line: it only escapes the next
# space, so the command would end at the first option. Inside an array
# literal every element can carry its own trailing comment safely.
train_args=(
  --is_fed=1                   # Enable federated learning mode
  --fixed_cluster=0            # Use dynamic clustering
  --split_factor=1             # Split factor for distributed computation
  --num_clusters=20            # Number of clusters to create
  --num_selected=20            # Number of selected clients per round
  --arch="wide_resnet16_8"     # Architecture to use (Wide ResNet-16-8)
  --dataset="cifar10"          # Dataset to use (CIFAR-10)
  --num_classes=10             # Number of classes in the dataset
  --is_single_branch=0         # Disable single branch training mode
  --is_amp=0                   # Disable automatic mixed precision
  --num_rounds=300             # Number of communication rounds
  --fed_epochs=1               # Number of local epochs for each client per round
  --spid="fedgkt_wrn168_split1_cifar10_20clients_20choose_300rounds"  # Unique ID for the experiment
  --data="${LOCAL_DATA_PATH}"  # Local path to the dataset
)

# Run the training script; the quoted expansion passes each element as
# exactly one argument.
python run_gkt.py "${train_args[@]}"