Commit 3a3b8f7b authored by Pfister, Martin

Add VSC5 scripts for FSDP llama3.1-70b

parent 7e9cf1bb
#!/bin/bash
#SBATCH --partition=zen3_0512_a100x2
#SBATCH --qos=zen3_0512_a100x2
## Specify resources:
#SBATCH --nodes=1
#SBATCH --gres=gpu:2 # up to 2 on VSC5/A100
#SBATCH --ntasks-per-node=1
## No need to specify RAM on VSC5, as it will be automatically
## allocated depending on the number of GPUs requested.
#SBATCH --time=3:00:00
# Set conda environment:
CONDA_ENV=finetuning
# Load conda:
module purge
module load miniconda3
# Include commands in output:
set -x
# Print current time and date:
date
# Print host name:
hostname
# List available GPUs:
nvidia-smi
# Set environment variables for communication between nodes:
export MASTER_PORT=24998
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
# Print statistics:
echo "Using $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) GPUs on $SLURM_NNODES nodes."
# Run the training script via Accelerate:
srun bash -c "conda run -n $CONDA_ENV --no-capture-output accelerate launch \
--num_machines $SLURM_NNODES \
--num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \
--config_file \"fsdp_config.yml\" \
llama3.1-70b_train.py"
#!/bin/bash
#SBATCH --partition=zen2_0256_a40x2
#SBATCH --qos=zen2_0256_a40x2
## Specify resources:
#SBATCH --nodes=1
#SBATCH --gres=gpu:2 # up to 2 on VSC5/A40
#SBATCH --ntasks-per-node=1
## No need to specify RAM on VSC5, as it will be automatically
## allocated depending on the number of GPUs requested.
#SBATCH --time=3:00:00
# Set conda environment:
CONDA_ENV=finetuning
# Load conda:
module purge
module load miniconda3
# Include commands in output:
set -x
# Print current time and date:
date
# Print host name:
hostname
# List available GPUs:
nvidia-smi
# Set environment variables for communication between nodes:
export MASTER_PORT=24998
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
# Print statistics:
echo "Using $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) GPUs on $SLURM_NNODES nodes."
# Run the training script via Accelerate:
srun bash -c "conda run -n $CONDA_ENV --no-capture-output accelerate launch \
--num_machines $SLURM_NNODES \
--num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \
--config_file \"fsdp_config.yml\" \
llama3.1-70b_train.py"