#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan
# Load necessary modules and dependencies
source /etc/profile.d/modules.sh
module load gcc/11.2.0
module load openmpi/4.1.3
module load cuda/11.5/11.5.2
module load cudnn/8.3/8.3.3
module load nccl/2.11/2.11.4-1
module load python/3.10/3.10.4
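# Optional sanity check (not part of the original script; assumes the standard
# Environment Modules tooling and NVIDIA utilities are available on the node):
# record which modules actually loaded and whether a GPU is visible.
module list 2>&1
nvidia-smi || echo "Warning: no GPU visible via nvidia-smi"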
# Activate the Python environment
source ~/venv/pytorch1.11+horovod/bin/activate
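# Optional check (assumption, not in the original script): confirm the virtual
# environment provides the expected PyTorch build before launching training.
python -c "import torch; print('torch', torch.__version__, 'cuda available:', torch.cuda.is_available())"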
# Configure log directory and clean up any existing records
OUTPUT_LOG_DIR="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${OUTPUT_LOG_DIR}"
mkdir -p "${OUTPUT_LOG_DIR}"
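# Optional (assumption, not in the original script): record basic job metadata in the
# freshly created log directory so each run under records/ is self-describing.
echo "Job ${JOB_NAME} (${JOB_ID}) started at $(date)" > "${OUTPUT_LOG_DIR}/job_info.txt"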
# Copy dataset to local directory for processing
LOCAL_DATA_PATH="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${LOCAL_DATA_PATH}"
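# Optional check (assumption, not in the original script): verify the dataset actually
# landed on node-local storage before spending GPU time.
du -sh "${LOCAL_DATA_PATH}" || echo "Warning: local dataset copy appears to be missing"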
# Switch to the working directory containing the training scripts
cd EdgeFLite || exit 1  # Stop here if the working directory is missing
# Run the training script with specified settings for federated learning
train_args=(
    --is_fed=1                   # Enable federated learning mode
    --fixed_cluster=0            # Use dynamic clustering
    --split_factor=1             # Split factor for distributed computation
    --num_clusters=20            # Number of clusters to create
    --num_selected=20            # Number of selected clients per round
    --arch="wide_resnet16_8"     # Architecture to use (Wide ResNet-16-8)
    --dataset="cifar10"          # Dataset to use (CIFAR-10)
    --num_classes=10             # Number of classes in the dataset
    --is_single_branch=0         # Disable single-branch training mode
    --is_amp=0                   # Disable automatic mixed precision
    --num_rounds=300             # Number of communication rounds
    --fed_epochs=1               # Number of local epochs per client per round
    --spid="fedgkt_wrn168_split1_cifar10_20clients_20choose_300rounds"  # Unique ID for the experiment
    --data="${LOCAL_DATA_PATH}"  # Local path to the dataset
)
python run_gkt.py "${train_args[@]}"
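# Note (assumption): OUTPUT_LOG_DIR is prepared above but nothing is written into it by
# this script; one option is to capture the training output there, e.g. by appending
#   2>&1 | tee "${OUTPUT_LOG_DIR}/train.log"
# to the python command above.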