#!/bin/bash
# -*- coding: utf-8 -*-
# @Author: Weisen Pan

# Initialize the environment and load the required modules
# This sets up GCC, OpenMPI, CUDA, cuDNN, NCCL, and Python for the run
source /etc/profile.d/modules.sh
module load gcc/11.2.0            # GCC 11.2.0 for compilation
module load openmpi/4.1.3         # OpenMPI 4.1.3 for distributed computing
module load cuda/11.5/11.5.2      # CUDA 11.5.2 for GPU computing
module load cudnn/8.3/8.3.3       # cuDNN 8.3.3 for deep learning frameworks
module load nccl/2.11/2.11.4-1    # NCCL 2.11.4-1 for multi-GPU communication
module load python/3.10/3.10.4    # Python 3.10.4

# Activate the Python virtual environment
# The pre-configured environment contains the required Python packages (e.g., PyTorch 1.11, Horovod)
source ~/venv/pytorch1.11+horovod/bin/activate

# Prepare the log directory and clean up any old records
# JOB_NAME and JOB_ID are populated by the Grid Engine scheduler at job start
LOG_DIRECTORY="/home/projadmin/Federated_Learning/project_EdgeFLite/records/${JOB_NAME}_${JOB_ID}"
rm -rf "${LOG_DIRECTORY}"    # Remove logs left over from a previous run, if any
mkdir -p "${LOG_DIRECTORY}"  # Create a fresh directory for this job's logs

# Stage the dataset on node-local storage for training
# SGE_LOCALDIR is the node-local scratch directory provided by the scheduler
DATA_STORAGE="${SGE_LOCALDIR}/${JOB_ID}/"
cp -r ../summit2024/simpleFL/performance_test/cifar100/data "${DATA_STORAGE}"  # Copy the CIFAR-100 dataset locally

# Navigate to the EdgeFLite project directory to run the training script
cd EdgeFLite

# Execute the training process for the federated learning model.
# Bash does not allow inline comments on backslash-continued lines (the
# comment would terminate the command), so the flags are documented here:
#   --is_fed=1                        Enable federated learning mode
#   --fixed_cluster=0                 Disable fixed clusters, allowing dynamic changes
#   --split_factor=16                 Set the data split factor to 16
#   --num_clusters=6                  Use 6 clusters in the federated learning process
#   --num_selected=6                  Select 6 clients per training round
#   --arch="wide_resnetsl16_8"        Wide ResNet architecture with depth 16 and width 8
#   --dataset="cifar100"              Use CIFAR-100 as the dataset
#   --num_classes=100                 CIFAR-100 has 100 output classes
#   --is_single_branch=0              Use multi-branch (multi-head) learning
#   --is_amp=0                        Disable automatic mixed precision training
#   --num_rounds=650                  Train for 650 communication rounds
#   --fed_epochs=1                    Each client trains for 1 local epoch per round
#   --spid="EdgeFLite_W168_96c_650r"  Unique identifier for this job
#   --data=${DATA_STORAGE}            Location of the staged dataset
python train_EdgeFLite.py \
    --is_fed=1 \
    --fixed_cluster=0 \
    --split_factor=16 \
    --num_clusters=6 \
    --num_selected=6 \
    --arch="wide_resnetsl16_8" \
    --dataset="cifar100" \
    --num_classes=100 \
    --is_single_branch=0 \
    --is_amp=0 \
    --num_rounds=650 \
    --fed_epochs=1 \
    --spid="EdgeFLite_W168_96c_650r" \
    --data="${DATA_STORAGE}"
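
# Optional post-run step (a minimal sketch, not in the original script):
# if train_EdgeFLite.py writes results into the working directory, they
# could be collected into the per-job log directory created above. The
# "outputs" subdirectory name is an assumption for illustration; adjust it
# to wherever the training script actually writes its artifacts.
if [ -d outputs ]; then
    cp -r outputs "${LOG_DIRECTORY}/"   # Preserve results alongside the job logs
fi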
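
# How this script might be submitted (a hedged sketch): the runtime request
# and script filename below are assumptions for illustration, not values
# from the source. Submitting through the Grid Engine scheduler is what
# populates the JOB_NAME, JOB_ID, and SGE_LOCALDIR variables used above.
#
#   qsub -N EdgeFLite_W168_96c_650r -l h_rt=72:00:00 run_EdgeFLite.sh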