From 1589f815238185f2aa465a64f2de45ff3c923751 Mon Sep 17 00:00:00 2001
From: Muck Katrin <katrin.muck@tuwien.ac.at>
Date: Fri, 21 Mar 2025 09:03:57 +0100
Subject: [PATCH 1/2] added spack info output, time limits and examples for
 a40x2 partition

---
 .../01-single-node-single-prog.sh             |  1 +
 .../02-single-node-multi-prog.sh              |  1 +
 .../03-single-node-multiple-tasks.sh          |  1 +
 .../01a-core-based-single-prog.sh             |  1 +
 .../01b-core-based-multi-prog.sh              |  1 +
 .../02-task-based-default.sh                  |  1 +
 .../03-task-based-multiple-cpus.sh            |  1 +
 .../04-task-based-multiple-programs.sh        |  1 +
 .../05-task-based-hetjob.sh                   |  1 +
 .../01-multi-node-full-node.sh                |  1 +
 .../01-core-based-job-array.sh                |  1 +
 .../02-task-based-job-array.sh                |  1 +
 .../03-node-based-job-array.sh                |  1 +
 .../04-node-based-job-array-throttled.sh      |  1 +
 .../05-node-based-multiple-tasks-job-array.sh |  1 +
 06-zen3_0512_a100x2-gpu-based/01-half-node.sh |  1 +
 06-zen3_0512_a100x2-gpu-based/02-full-node.sh |  1 +
 07-zen2_0256_a40x2-gpu-based/01-half-node.sh  | 30 ++++++++++++++++++++
 07-zen2_0256_a40x2-gpu-based/02-full-node.sh  | 28 ++++++++++++++++++
 .../01-pytorch-python-single-gpu.sh           |  1 +
 .../01-tensorflow-python-single-node.sh       |  1 +
 98-frameworks-ray/01-ray-python-multi-node.sh |  8 ++---
 .../02-ray-python-multi-node-gpu.sh           |  6 ++--
 util/print_job_info.sh                        |  2 ++
 util/spack_info.sh                            | 15 ++++++++
 25 files changed, 101 insertions(+), 7 deletions(-)
 create mode 100644 07-zen2_0256_a40x2-gpu-based/01-half-node.sh
 create mode 100644 07-zen2_0256_a40x2-gpu-based/02-full-node.sh
 create mode 100755 util/spack_info.sh

diff --git a/02-skylake_0096-single-node/01-single-node-single-prog.sh b/02-skylake_0096-single-node/01-single-node-single-prog.sh
index f3fca7f..def37d6 100644
--- a/02-skylake_0096-single-node/01-single-node-single-prog.sh
+++ b/02-skylake_0096-single-node/01-single-node-single-prog.sh
@@ -24,6 +24,7 @@
 
 #SBATCH --nodes=1
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/02-skylake_0096-single-node/02-single-node-multi-prog.sh b/02-skylake_0096-single-node/02-single-node-multi-prog.sh
index 3a76a90..8f39850 100644
--- a/02-skylake_0096-single-node/02-single-node-multi-prog.sh
+++ b/02-skylake_0096-single-node/02-single-node-multi-prog.sh
@@ -25,6 +25,7 @@
 
 #SBATCH --nodes=1
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
index df20d95..ba666a1 100644
--- a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
+++ b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
@@ -25,6 +25,7 @@
 
 #SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/03-skylake_0096-partial-node/01a-core-based-single-prog.sh b/03-skylake_0096-partial-node/01a-core-based-single-prog.sh
index 0cf35cb..48d9d78 100644
--- a/03-skylake_0096-partial-node/01a-core-based-single-prog.sh
+++ b/03-skylake_0096-partial-node/01a-core-based-single-prog.sh
@@ -25,5 +25,6 @@
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh
diff --git a/03-skylake_0096-partial-node/01b-core-based-multi-prog.sh b/03-skylake_0096-partial-node/01b-core-based-multi-prog.sh
index 2a54990..ad0667a 100644
--- a/03-skylake_0096-partial-node/01b-core-based-multi-prog.sh
+++ b/03-skylake_0096-partial-node/01b-core-based-multi-prog.sh
@@ -26,6 +26,7 @@
 
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/03-skylake_0096-partial-node/02-task-based-default.sh b/03-skylake_0096-partial-node/02-task-based-default.sh
index 33d329d..99d52be 100644
--- a/03-skylake_0096-partial-node/02-task-based-default.sh
+++ b/03-skylake_0096-partial-node/02-task-based-default.sh
@@ -25,6 +25,7 @@
 
 #SBATCH --mem=4G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
index e3efa3f..5f516ed 100644
--- a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
+++ b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
@@ -28,6 +28,7 @@
 
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
index 331ee8e..fbb0802 100644
--- a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
+++ b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
@@ -28,6 +28,7 @@
 
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/03-skylake_0096-partial-node/05-task-based-hetjob.sh b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
index 0d03086..8d1bf6f 100644
--- a/03-skylake_0096-partial-node/05-task-based-hetjob.sh
+++ b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
@@ -35,6 +35,7 @@
 
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/04-skylake_0096-multi-node/01-multi-node-full-node.sh b/04-skylake_0096-multi-node/01-multi-node-full-node.sh
index 34cbb1d..8fceefa 100644
--- a/04-skylake_0096-multi-node/01-multi-node-full-node.sh
+++ b/04-skylake_0096-multi-node/01-multi-node-full-node.sh
@@ -26,6 +26,7 @@
 
 #SBATCH --cpus-per-task=48
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/05-skylake_0096-job-array/01-core-based-job-array.sh b/05-skylake_0096-job-array/01-core-based-job-array.sh
index b54a93e..2e33fff 100644
--- a/05-skylake_0096-job-array/01-core-based-job-array.sh
+++ b/05-skylake_0096-job-array/01-core-based-job-array.sh
@@ -26,6 +26,7 @@
 
 #SBATCH --mem=4G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/05-skylake_0096-job-array/02-task-based-job-array.sh b/05-skylake_0096-job-array/02-task-based-job-array.sh
index db43815..949c165 100644
--- a/05-skylake_0096-job-array/02-task-based-job-array.sh
+++ b/05-skylake_0096-job-array/02-task-based-job-array.sh
@@ -26,6 +26,7 @@
 
 #SBATCH --mem=4G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh
diff --git a/05-skylake_0096-job-array/03-node-based-job-array.sh b/05-skylake_0096-job-array/03-node-based-job-array.sh
index 87d5cb6..5aef036 100644
--- a/05-skylake_0096-job-array/03-node-based-job-array.sh
+++ b/05-skylake_0096-job-array/03-node-based-job-array.sh
@@ -20,6 +20,7 @@
 
 #SBATCH --nodes=1
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/05-skylake_0096-job-array/04-node-based-job-array-throttled.sh b/05-skylake_0096-job-array/04-node-based-job-array-throttled.sh
index 2d1d1e6..96a3647 100644
--- a/05-skylake_0096-job-array/04-node-based-job-array-throttled.sh
+++ b/05-skylake_0096-job-array/04-node-based-job-array-throttled.sh
@@ -21,6 +21,7 @@
 
 #SBATCH --nodes=1
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
index 38bc180..caba7b4 100644
--- a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
+++ b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
@@ -25,6 +25,7 @@
 
 #SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/06-zen3_0512_a100x2-gpu-based/01-half-node.sh b/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
index e67ee30..0c6e02e 100644
--- a/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
+++ b/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
@@ -24,5 +24,6 @@
 #SBATCH --partition=zen3_0512_a100x2
 #SBATCH --qos=zen3_0512_a100x2
 #SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/06-zen3_0512_a100x2-gpu-based/02-full-node.sh b/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
index 49a66be..6204b6b 100644
--- a/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
+++ b/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
@@ -23,5 +23,6 @@
 #SBATCH --partition=zen3_0512_a100x2
 #SBATCH --qos=zen3_0512_a100x2
 #SBATCH --gres=gpu:2
+#SBATCH --time=00:05:00 # set low time limit for testing
 
 ../util/print_job_info.sh

diff --git a/07-zen2_0256_a40x2-gpu-based/01-half-node.sh b/07-zen2_0256_a40x2-gpu-based/01-half-node.sh
new file mode 100644
index 0000000..82a2883
--- /dev/null
+++ b/07-zen2_0256_a40x2-gpu-based/01-half-node.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+##############################################################################
+# User Request:
+# - run in qos 'zen2_0256_a40x2'
+# - get access to 1 gpu
+#
+# Provided Allocation:
+# - non-exclusive (shared not set)
+# - 8 physical cores / 16 logical cores
+# - 128 GB memory
+# - (implicitly): 8 tasks on 1 node
+# - (implicitly): 1 physical core bound to each task
+#
+# VSC policy:
+# - 'SingleCore' feature set -> only gets scheduled on `SingleCore` nodes
+# - '--ntasks-per-node' & '--ntasks' implicitly set to 8
+# - '--mem' (per node) implicitly set to 128 GB
+#
+# Accounting:
+# - 8 core hours / hour
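+#   (1 of the node's 2 gpus = half of its 16 physical cores = 8 core hours / hour)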
+##############################################################################
+
+#SBATCH --partition=zen2_0256_a40x2
+#SBATCH --qos=zen2_0256_a40x2
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:05:00 # set low time limit for testing
+
+../util/print_job_info.sh

diff --git a/07-zen2_0256_a40x2-gpu-based/02-full-node.sh b/07-zen2_0256_a40x2-gpu-based/02-full-node.sh
new file mode 100644
index 0000000..2561c0f
--- /dev/null
+++ b/07-zen2_0256_a40x2-gpu-based/02-full-node.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+##############################################################################
+# User Request:
+# - run in qos 'zen2_0256_a40x2'
+# - get access to 2 gpus
+#
+# Provided Allocation:
+# - exclusive access
+# - 16 physical cores / 32 logical cores
+# - 256 GB memory
+# - (implicitly): 16 tasks on 1 node
+# - (implicitly): 1 physical core bound to each task
+#
+# VSC policy:
+# - '--ntasks-per-node' & '--ntasks' implicitly set to 16
+# - '--mem' (per node) implicitly set to 256 GB
+#
+# Accounting:
+# - 16 core hours / hour
+##############################################################################
+
+#SBATCH --partition=zen2_0256_a40x2
+#SBATCH --qos=zen2_0256_a40x2
+#SBATCH --gres=gpu:2
+#SBATCH --time=00:05:00 # set low time limit for testing
+
+../util/print_job_info.sh

diff --git a/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh b/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
index b26f823..f106fb6 100644
--- a/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
+++ b/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
@@ -21,6 +21,7 @@
 #SBATCH --partition=zen2_0256_a40x2
 #SBATCH --qos=zen2_0256_a40x2
 #SBATCH --gres=gpu:1
+#SBATCH --time=00:10:00 # set low time limit for testing
 
 # optionally activate a conda or python environment
 module load miniconda3

diff --git a/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh b/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
index 1ae0624..f90ae7a 100644
--- a/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
+++ b/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
@@ -21,6 +21,7 @@
 #SBATCH --partition=zen2_0256_a40x2
 #SBATCH --qos=zen2_0256_a40x2
 #SBATCH --gres=gpu:2
+#SBATCH --time=00:10:00 # set low time limit for testing
 
 # optionally activate a conda or python environment
 module load miniconda3

diff --git a/98-frameworks-ray/01-ray-python-multi-node.sh b/98-frameworks-ray/01-ray-python-multi-node.sh
index bee3518..9f8cd91 100755
--- a/98-frameworks-ray/01-ray-python-multi-node.sh
+++ b/98-frameworks-ray/01-ray-python-multi-node.sh
@@ -22,10 +22,10 @@
 ##############################################################################
 
 #SBATCH --job-name=ray-test
-#SBATCH --qos=zen3_0512               # select zen3_0512 default qos
-#SBATCH --partition=zen3_0512         # select zen3_0512 hardware
-#SBATCH --nodes=3                     # tell VSC slurm to allocate 3 exclusive nodes
-#SBATCH --time=00:10:00               # set time limit of 5 min for testing
+#SBATCH --qos=zen3_0512             # select zen3_0512 default qos
+#SBATCH --partition=zen3_0512       # select zen3_0512 hardware
+#SBATCH --nodes=3                   # tell VSC slurm to allocate 3 exclusive nodes
+#SBATCH --time=00:10:00             # set low time limit for testing
 #SBATCH --tasks-per-node=1          # 1 task per node (1 head + 2 workers)
 
 # optionally activate a conda or python environment

diff --git a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
index d37a473..ff8dc13 100755
--- a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
+++ b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
@@ -24,9 +24,9 @@
 
 #SBATCH --job-name=ray-test
 #SBATCH --qos=zen3_0512_a100x2_devel # select zen3_0512_a100x2_devel devel qos for testing
-#SBATCH --partition=zen3_0512_a100x2   # select zen3_0512 hardware
-#SBATCH --time=00:10:00                # set time limit of 10 min for testing
-#SBATCH --nodes=2                      # tell VSC slurm to allocate 2 exclusive nodes
+#SBATCH --partition=zen3_0512_a100x2 # select zen3_0512 hardware
+#SBATCH --time=00:10:00             # set low time limit for testing
+#SBATCH --nodes=2                   # tell VSC slurm to allocate 2 exclusive nodes
 #SBATCH --gres=gpu:2                # furthermore allocate 2 gpus per node (=full node)
 #SBATCH --tasks-per-node=1          # 1 task per node (1 head + 1 worker)
 #SBATCH --cpus-per-task=128         # 128 (logical) cpus per task

diff --git a/util/print_job_info.sh b/util/print_job_info.sh
index 00ccc35..9f64862 100755
--- a/util/print_job_info.sh
+++ b/util/print_job_info.sh
@@ -21,3 +21,5 @@
 $SCRIPT_DIR/computed_available_resources.sh
 $SCRIPT_DIR/cgroup_resources.sh "/slurm/uid_${SLURM_JOB_UID}/job_${SLURM_JOB_ID}"
 $SCRIPT_DIR/slurm_vars.sh
+
+$SCRIPT_DIR/spack_info.sh

diff --git a/util/spack_info.sh b/util/spack_info.sh
new file mode 100755
index 0000000..03061b5
--- /dev/null
+++ b/util/spack_info.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
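+
+# print the name and root of the currently active spack installation
+# tree; SPACK_TREE and SPACK_ROOT are assumed to be set by the site's
+# spack setup tooling (e.g. by 'spackup')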
+
+echo ""
+echo "# spack env"
+
+if [[ -z "$SPACK_TREE" ]]; then
+    echo "  <no spack env info found or incomplete>"
+else
+    echo "  - name: $SPACK_TREE"
+    echo "  - root: $SPACK_ROOT"
+fi
--
GitLab


From 1d548926eb4ada70c83dc1128d8aa25dc6fac54d Mon Sep 17 00:00:00 2001
From: Muck Katrin <katrin.muck@tuwien.ac.at>
Date: Fri, 21 Mar 2025 09:25:56 +0100
Subject: [PATCH 2/2] added tactical module purge statements and spack
 environment activation commands for gpu partitions

---
 06-zen3_0512_a100x2-gpu-based/01-half-node.sh |  5 ++
 06-zen3_0512_a100x2-gpu-based/02-full-node.sh |  5 ++
 07-zen2_0256_a40x2-gpu-based/01-half-node.sh  |  5 ++
 07-zen2_0256_a40x2-gpu-based/02-full-node.sh  |  5 ++
 .../01-pytorch-python-single-gpu.sh           |  5 +-
 .../README.md                                 |  2 +-
 .../environment.yaml                          |  0
 .../pytorch-test.py                           |  0
 .../01-tensorflow-python-single-node.sh       |  5 +-
 .../environment.yaml                          |  0
 .../tensorflow-test.py                        |  0
 .../01-ray-python-multi-node.sh               |  5 +-
 .../02-ray-python-multi-node-gpu.sh           |  5 +-
 .../environment.yaml                          |  0
 .../ray-test.py                               |  0
 README.md                                     | 10 ++-
 util/unload_jupyter_env.sh                    | 77 +++++++++++++++++++
 17 files changed, 122 insertions(+), 7 deletions(-)
 rename {96-frameworks-pytorch-cuda => 80-frameworks-pytorch-cuda}/01-pytorch-python-single-gpu.sh (90%)
 rename {96-frameworks-pytorch-cuda => 80-frameworks-pytorch-cuda}/README.md (96%)
 rename {96-frameworks-pytorch-cuda => 80-frameworks-pytorch-cuda}/environment.yaml (100%)
 rename {96-frameworks-pytorch-cuda => 80-frameworks-pytorch-cuda}/pytorch-test.py (100%)
 rename {97-frameworks-tensorflow-cuda => 81-frameworks-tensorflow-cuda}/01-tensorflow-python-single-node.sh (91%)
 rename {97-frameworks-tensorflow-cuda => 81-frameworks-tensorflow-cuda}/environment.yaml (100%)
 rename {97-frameworks-tensorflow-cuda => 81-frameworks-tensorflow-cuda}/tensorflow-test.py (100%)
 rename {98-frameworks-ray => 82-frameworks-ray}/01-ray-python-multi-node.sh (97%)
 rename {98-frameworks-ray => 82-frameworks-ray}/02-ray-python-multi-node-gpu.sh (98%)
 rename {98-frameworks-ray => 82-frameworks-ray}/environment.yaml (100%)
 rename {98-frameworks-ray => 82-frameworks-ray}/ray-test.py (100%)
 create mode 100644 util/unload_jupyter_env.sh

diff --git a/06-zen3_0512_a100x2-gpu-based/01-half-node.sh b/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
index 0c6e02e..b046670 100644
--- a/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
+++ b/06-zen3_0512_a100x2-gpu-based/01-half-node.sh
@@ -26,4 +26,9 @@
 #SBATCH --gres=gpu:1
 #SBATCH --time=00:05:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+# enable cuda-zen tree to have the gpu software packages available
+spackup cuda-zen
+
 ../util/print_job_info.sh
diff --git a/06-zen3_0512_a100x2-gpu-based/02-full-node.sh b/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
index 6204b6b..c721018 100644
--- a/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
+++ b/06-zen3_0512_a100x2-gpu-based/02-full-node.sh
@@ -25,4 +25,9 @@
 #SBATCH --gres=gpu:2
 #SBATCH --time=00:05:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+# enable cuda-zen tree to have the gpu software packages available
+spackup cuda-zen
+
 ../util/print_job_info.sh

diff --git a/07-zen2_0256_a40x2-gpu-based/01-half-node.sh b/07-zen2_0256_a40x2-gpu-based/01-half-node.sh
index 82a2883..88c9d90 100644
--- a/07-zen2_0256_a40x2-gpu-based/01-half-node.sh
+++ b/07-zen2_0256_a40x2-gpu-based/01-half-node.sh
@@ -27,4 +27,9 @@
 #SBATCH --gres=gpu:1
 #SBATCH --time=00:05:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+# enable cuda-zen tree to have the gpu software packages available
+spackup cuda-zen
+
 ../util/print_job_info.sh

diff --git a/07-zen2_0256_a40x2-gpu-based/02-full-node.sh b/07-zen2_0256_a40x2-gpu-based/02-full-node.sh
index 2561c0f..991fef7 100644
--- a/07-zen2_0256_a40x2-gpu-based/02-full-node.sh
+++ b/07-zen2_0256_a40x2-gpu-based/02-full-node.sh
@@ -25,4 +25,9 @@
 #SBATCH --gres=gpu:2
 #SBATCH --time=00:05:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+# enable cuda-zen tree to have the gpu software packages available
+spackup cuda-zen
+
 ../util/print_job_info.sh

diff --git a/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh b/80-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
similarity index 90%
rename from 96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
rename to 80-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
index f106fb6..ca6f7dd 100644
--- a/96-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
+++ b/80-frameworks-pytorch-cuda/01-pytorch-python-single-gpu.sh
@@ -23,8 +23,11 @@
 #SBATCH --gres=gpu:1
 #SBATCH --time=00:10:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+
 # optionally activate a conda or python environment
-module load miniconda3
+module load miniconda3/latest
 eval "$(conda shell.bash hook)"
 conda activate pytorch-cuda
 

diff --git a/96-frameworks-pytorch-cuda/README.md b/80-frameworks-pytorch-cuda/README.md
similarity index 96%
rename from 96-frameworks-pytorch-cuda/README.md
rename to 80-frameworks-pytorch-cuda/README.md
index 1438a34..1385f81 100644
--- a/96-frameworks-pytorch-cuda/README.md
+++ b/80-frameworks-pytorch-cuda/README.md
@@ -3,7 +3,7 @@
 To install the environment simply use conda
 
 ```bash
-module load miniconda3
+module load miniconda3/latest
 eval "$(conda shell.bash hook)"
 conda env create -f environment.yaml
 ```

diff --git a/96-frameworks-pytorch-cuda/environment.yaml b/80-frameworks-pytorch-cuda/environment.yaml
similarity index 100%
rename from 96-frameworks-pytorch-cuda/environment.yaml
rename to 80-frameworks-pytorch-cuda/environment.yaml

diff --git a/96-frameworks-pytorch-cuda/pytorch-test.py b/80-frameworks-pytorch-cuda/pytorch-test.py
similarity index 100%
rename from 96-frameworks-pytorch-cuda/pytorch-test.py
rename to 80-frameworks-pytorch-cuda/pytorch-test.py
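diff --git: the conda deactivation loop in util/unload_jupyter_env.sh (see
patch 2 below) now carries two explanatory comment lines and a default for
CONDA_SHLVL; the hunk header and diffstat above already account for them.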
diff --git a/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh b/81-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
similarity index 91%
rename from 97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
rename to 81-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
index f90ae7a..7e6b48c 100644
--- a/97-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
+++ b/81-frameworks-tensorflow-cuda/01-tensorflow-python-single-node.sh
@@ -23,8 +23,11 @@
 #SBATCH --gres=gpu:2
 #SBATCH --time=00:10:00 # set low time limit for testing
 
+# purge all previously loaded modules
+module purge
+
 # optionally activate a conda or python environment
-module load miniconda3
+module load miniconda3/latest
 eval "$(conda shell.bash hook)"
 conda activate tensorflow-cuda
 

diff --git a/97-frameworks-tensorflow-cuda/environment.yaml b/81-frameworks-tensorflow-cuda/environment.yaml
similarity index 100%
rename from 97-frameworks-tensorflow-cuda/environment.yaml
rename to 81-frameworks-tensorflow-cuda/environment.yaml

diff --git a/97-frameworks-tensorflow-cuda/tensorflow-test.py b/81-frameworks-tensorflow-cuda/tensorflow-test.py
similarity index 100%
rename from 97-frameworks-tensorflow-cuda/tensorflow-test.py
rename to 81-frameworks-tensorflow-cuda/tensorflow-test.py

diff --git a/98-frameworks-ray/01-ray-python-multi-node.sh b/82-frameworks-ray/01-ray-python-multi-node.sh
similarity index 97%
rename from 98-frameworks-ray/01-ray-python-multi-node.sh
rename to 82-frameworks-ray/01-ray-python-multi-node.sh
index 9f8cd91..6d07b83 100755
--- a/98-frameworks-ray/01-ray-python-multi-node.sh
+++ b/82-frameworks-ray/01-ray-python-multi-node.sh
@@ -28,9 +28,12 @@
 #SBATCH --time=00:10:00             # set low time limit for testing
 #SBATCH --tasks-per-node=1          # 1 task per node (1 head + 2 workers)
 
+# purge all previously loaded modules
+module purge
+
 # optionally activate a conda or python environment
 module load openmpi/4.1.6-gcc-12.2.0-exh7lqk
-module load miniconda3
+module load miniconda3/latest
 eval "$(conda shell.bash hook)"
 conda activate ray
 

diff --git a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh b/82-frameworks-ray/02-ray-python-multi-node-gpu.sh
similarity index 98%
rename from 98-frameworks-ray/02-ray-python-multi-node-gpu.sh
rename to 82-frameworks-ray/02-ray-python-multi-node-gpu.sh
index ff8dc13..afdabb8 100755
--- a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
+++ b/82-frameworks-ray/02-ray-python-multi-node-gpu.sh
@@ -33,8 +33,11 @@
 #SBATCH --gpus-per-task=2           # 2 gpus per task
 #SBATCH --hint=nomultithread        # specify this to get 1 thread per physical core
 
+# purge all previously loaded modules
+module purge
+
 # optionally load packages and/or activate a conda environment...
-module load miniconda3
+module load miniconda3/latest
 eval "$(conda shell.bash hook)"
 conda activate ray
 

diff --git a/98-frameworks-ray/environment.yaml b/82-frameworks-ray/environment.yaml
similarity index 100%
rename from 98-frameworks-ray/environment.yaml
rename to 82-frameworks-ray/environment.yaml

diff --git a/98-frameworks-ray/ray-test.py b/82-frameworks-ray/ray-test.py
similarity index 100%
rename from 98-frameworks-ray/ray-test.py
rename to 82-frameworks-ray/ray-test.py

diff --git a/README.md b/README.md
index d87115d..297cdf6 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,16 @@
 
 The purpose of this repository is to have a set of slurm job scripts with expected results.
 
-This way we have example we can give to users as a starting point as well as have something to test our lua implementation against.
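+# (note: '=~' matches $spack_base as a regex; this assumes spack tree
+# paths contain no regex metacharacters)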
+This way we have examples we can give to users as a starting point, as well as something to test our slurm lua implementation against.
 
+## How to run examples
 
-# Explanations
+To run an example, `cd` into its folder and use `sbatch` to submit the job script.
+
+```
+cd 01-basics
+sbatch 01-node-simple.sh
+```
 
 ## Exclusive/Shared Nodes (OverSubscribe)
 

diff --git a/util/unload_jupyter_env.sh b/util/unload_jupyter_env.sh
new file mode 100644
index 0000000..5958458
--- /dev/null
+++ b/util/unload_jupyter_env.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+##
+# source this script from a jupyter terminal or notebook cell
+# to unset all jupyter-related env variables and functions
+#
+# in terminal:
+# $ source ./unload_jupyter_env.sh
+# $ sbatch myjob.sh
+#
+# in jupyter notebook:
+# ! source ./unload_jupyter_env.sh && sbatch myjob.sh
+##
+
+DEBUG="$1"
+
+function debug_print() {
+    if [ -z "$DEBUG" ]; then
+        return
+    fi
+    echo "$@"
+}
+
+
+if conda -V >/dev/null 2>&1; then
+    eval "$(conda shell.bash hook)"
+
+    # CONDA_SHLVL holds the number of currently stacked conda envs;
+    # default to 0 so 'seq' emits nothing if conda was never activated
+    for i in $(seq ${CONDA_SHLVL:-0}); do
+        conda deactivate
+    done
+
+    debug_print "Deactivated all conda envs ..."
+else
+    debug_print "No conda found."
+fi
+
+PREVIOUS_IFS="$IFS"
+IFS=$'\n'
+SLURM_VARS=$( env | sort | grep -E "^SLURM_.*=" | sed "s/=.*//g" )
+for var in $SLURM_VARS; do
+    unset $var
+done
+debug_print "Unset all SLURM_* env variables ..."
+IFS="$PREVIOUS_IFS"
+
+spack unload
+debug_print "Unloaded all spack packages ..."
+
+module purge
+debug_print "Unloaded all modules ..."
+
+# sanitize LD_LIBRARY_PATH by removing all paths from spack base
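+spack_base=$( readlink -f "$( dirname $( which spack ) )/../" )
+library_path=${LD_LIBRARY_PATH//:/ }
+new_library_path=
+for path in $library_path; do
+    if [[ $path =~ $spack_base ]]; then
+        continue
+    fi
+    if [[ $new_library_path =~ $path ]]; then
+        continue
+    fi
+    if [ -z "$new_library_path" ]; then
+        new_library_path="$path"
+    else
+        new_library_path="$new_library_path:$path"
+    fi
+done
+export LD_LIBRARY_PATH="$new_library_path"
+export LIBRARY_PATH=
+debug_print "Removed all spack library paths ..."
+
+echo "Jupyter env (conda, slurm & spack) unloaded."
--
GitLab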