diff --git a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
index 8ac44832e83f064576eb6884879bcbdc2ebd6715..df20d95dd69cd77e1205f9a9e58969aee1fdb980 100644
--- a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
+++ b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
@@ -22,10 +22,10 @@
 #SBATCH --job-name="full node; single program"
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=12 ../util/print_task_info.sh
+srun --cpus-per-task=24 ../util/print_task_info.sh
diff --git a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
index a689e307680c01f83f82bb00bd873640c9f94bde..e3efa3f292cb0980936551166b200e79a16e2416 100644
--- a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
+++ b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
@@ -24,7 +24,7 @@
 
 #SBATCH --job-name="task based; 4 tasks; same program; 4 cores / task; 8 GB"
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
@@ -33,4 +33,4 @@
 
 # from sbatch docs:
 # Beginning with 22.05, srun will not inherit the --cpus-per-task value
-srun --cpus-per-task=4 ../util/print_task_info.sh
+srun --cpus-per-task=8 ../util/print_task_info.sh
diff --git a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
index bb6cefbc793ac795b4f951cb922a188125e54240..331ee8e8b0f154ec8e806c27c7c8a7e5385f0dc4 100644
--- a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
+++ b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
@@ -24,7 +24,7 @@
 
 #SBATCH --job-name="task based; 4 tasks; different program; 4 cores / task; 8 GB"
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
@@ -33,4 +33,4 @@
 
 # from sbatch docs:
 # Beginning with 22.05, srun will not inherit the --cpus-per-task value
-srun --cpus-per-task=4 --multi-prog "04-task-based-multiple-programs.conf"
+srun --cpus-per-task=8 --multi-prog "04-task-based-multiple-programs.conf"
diff --git a/03-skylake_0096-partial-node/05-task-based-hetjob.sh b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
index 31e4030d7611076d40972561713497a687da2191..0d030863f612bcec2ab7f51a099f229ac48eb6ff 100644
--- a/03-skylake_0096-partial-node/05-task-based-hetjob.sh
+++ b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
@@ -31,7 +31,7 @@
 #SBATCH --qos=skylake_0096
 #SBATCH hetjob
 #SBATCH --ntasks=2
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
diff --git a/05-skylake_0096-job-array/02-task-based-job-array.sh b/05-skylake_0096-job-array/02-task-based-job-array.sh
index a66d9d52888f57583132d2f11409b6958036d3be..db43815fd8ce67fe99c8bb1945365970561fe1c3 100644
--- a/05-skylake_0096-job-array/02-task-based-job-array.sh
+++ b/05-skylake_0096-job-array/02-task-based-job-array.sh
@@ -22,11 +22,11 @@
 #SBATCH --job-name="job array; 2 jobs; same program; 1 task; 16 physical cores; 4 GB"
 #SBATCH --array=0-1
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=16
+#SBATCH --cpus-per-task=32
 #SBATCH --mem=4G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=16 ../util/print_task_info.sh
+srun --cpus-per-task=32 ../util/print_task_info.sh
diff --git a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
index 3cee2aa498d11bb8b424400a7a3790365cde26fc..38bc1800b5ab233faf8c567c39665b9917fe9463 100644
--- a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
+++ b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
@@ -22,10 +22,10 @@
 #SBATCH --array=0-1
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=12 ../util/print_task_info.sh
+srun --cpus-per-task=24 ../util/print_task_info.sh
diff --git a/98-frameworks-ray/01-ray-python-multi-node.sh b/98-frameworks-ray/01-ray-python-multi-node.sh
index ea7a4194205b2265fa9003d61beca30a39689df9..bee3518a1e1e6935865691d92f3f56b711ede1f1 100755
--- a/98-frameworks-ray/01-ray-python-multi-node.sh
+++ b/98-frameworks-ray/01-ray-python-multi-node.sh
@@ -42,8 +42,8 @@ echo "nodes_num: $nodes_num"
 tasks_per_node=$SLURM_NTASKS_PER_NODE
 echo "tasks_per_node: $tasks_per_node"
 
-# (physical) cpus per node (slurm cpus on node gives us logical cores)
-cpus_per_node=$(( SLURM_CPUS_ON_NODE / 2 ))
+# logical cpus per node (slurm cpus on node gives us logical cores)
+cpus_per_node=$(( SLURM_CPUS_ON_NODE ))
 echo "cpus_per_node: $cpus_per_node"
 
 # cpus per task: ray itself should do the work scheduling and hardware management
diff --git a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d37a4731d4c4a936ca7c59f356a8b9150f1a8138
--- /dev/null
+++ b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+##############################################################################
+# User Request:
+# - allocate multiple full nodes with A100 GPUs
+# - run ray head on batch host
+# - run ray workers in tasks distributed over all nodes
+# - run python script on batch host (using the actual workers)
+#
+# Provided Allocation:
+# - 2 exclusive nodes
+# - 2x 64 physical cores / 128 logical cores
+# - 2x 512 GB memory
+# - 4x A100 GPU w 4x 40GB memory
+#
+# VSC policy:
+# - '--nodes' flag set -> exclusive node allocation
+# - '--ntasks-per-node' & '--ntasks' implicitly set to 64
+#   but overridden with srun arguments
+#
+# Accounting:
+# - 2x 64 core hours / hour
+##############################################################################
+
+#SBATCH --job-name=ray-test
+#SBATCH --qos=zen3_0512_a100x2_devel   # select zen3_0512_a100x2_devel devel qos for testing
+#SBATCH --partition=zen3_0512_a100x2   # select zen3_0512 hardware
+#SBATCH --time=00:10:00                # set time limit of 10 min for testing
+#SBATCH --nodes=2                      # tell VSC slurm to allocate 2 exclusive nodes
+#SBATCH --gres=gpu:2                   # furthermore allocate 2 gpus per node (=full node)
+#SBATCH --tasks-per-node=1             # 1 task per node (1 head + 1 worker)
+#SBATCH --cpus-per-task=128            # 128 (logical) cpus per task
+#SBATCH --gpus-per-task=2              # 2 gpus per task
+#SBATCH --hint=nomultithread           # specify this to get 1 thread per physical core
+
+# optionally load packages and/or activate a conda environment...
+module load miniconda3
+eval "$(conda shell.bash hook)"
+conda activate ray
+
+# number of nodes
+nodes_num=$SLURM_JOB_NUM_NODES
+echo "nodes_num: $nodes_num"
+
+# tasks per node -> 1 task per node that starts a head/worker
+tasks_per_node=$SLURM_NTASKS_PER_NODE
+echo "tasks_per_node: $tasks_per_node"
+
+# cpus per task: let ray know how many cpus there are for internal scheduling
+cpus_per_task=$SLURM_CPUS_PER_TASK
+echo "cpus_per_task: $cpus_per_task"
+
+# gpus per task: get the number of gpus per task
+gpus_per_task=$SLURM_GPUS_PER_TASK
+echo "gpus_per_task: $gpus_per_task"
+
+# number of tasks & workers
+tasks_total=$(( nodes_num * tasks_per_node ))
+echo "tasks_total: $tasks_total"
+
+workers_total=$(( tasks_total - 1 ))
+echo "workers_total: $workers_total"
+echo ""
+
+# print general job info
+../util/print_job_info.sh
+
+# the first task we start goes to the batch host, so to get the ip address
+# we can just use this node's ib0 ip address for the head task
+head_node_ip=$( ip address show ib0 | awk '/inet / {print $2}' | cut -d "/" -f 1 )
+# head_node_ip=$( hostname --ip-address )
+export head_node_ip
+export head_node_port=6379
+
+logfile="slurm-${SLURM_JOB_ID}-head.out"
+srun_args_head="--nodes=1 --ntasks=1 --cpus-per-task=$cpus_per_task --gres=gpu:$gpus_per_task --input=none --output=$logfile"
+echo "starting head with: $srun_args_head"
+srun $srun_args_head \
+    ray start --head --log-color=false \
+    --node-ip-address="$head_node_ip" --port="$head_node_port" \
+    --num-cpus="$cpus_per_task" --num-gpus="$gpus_per_task" \
+    --block &
+
+
+# wait to give the head time to start
+sleep 10s
+
+# start workers
+logfile="slurm-${SLURM_JOB_ID}-worker-%t.out"
+srun_args_worker="--relative=1 --nodes=$workers_total --ntasks=$workers_total --cpus-per-task=$cpus_per_task --gres=gpu:$gpus_per_task --input=none --output=$logfile"
+echo "starting $workers_total workers with: $srun_args_worker"
+srun $srun_args_worker \
+    ray start --log-color=false \
+    --address="$head_node_ip:$head_node_port" \
+    --num-cpus="$cpus_per_task" --num-gpus="$gpus_per_task" \
+    --block &
+
+# wait for 10s to give the workers time to start
+sleep 10s
+
+# note: the started tasks will keep running as long as this batch process exists (or until they exit on their own)
+# so either wait for all child processes to finish
+# wait
+
+#
+# OR
+#
+# run your own program (on the batch host)
+# the assumption is that in such a setup the main script only schedules work and
+# waits for it to finish. the program will be able to use the batch step's resources.
+#
+echo "$( date ): executing script ..."
+python3 ray-test.py "$head_node_ip:$head_node_port"
+echo "$( date ): script end."
+
+#
+# OR
+#
+# start your own program in another task
+# better suited if there are also computationally intensive steps in the main program
+# don't forget to adjust resource allocation accordingly, e.g. allocate another node
+# and start workers at index 2
+#
+#srun $srun_args -r 1 ../my_program_script.sh
diff --git a/98-frameworks-ray/environment.yaml b/98-frameworks-ray/environment.yaml
index 43f1116c9c8eb11245852e64a43164c5c7ac99d5..c4376886eaf1004497824789898276b6a6855cdc 100644
--- a/98-frameworks-ray/environment.yaml
+++ b/98-frameworks-ray/environment.yaml
@@ -2,8 +2,10 @@ name: ray
 channels:
   - bioconda
   - conda-forge
-  - defaults
+  - nvidia
+  - pytorch
 dependencies:
-  - python=3.11
+  - python=3.*
   - libblas=*=*mkl
   - ray-all
+  - pytorch=*=*cuda12.1*
diff --git a/98-frameworks-ray/ray-test.py b/98-frameworks-ray/ray-test.py
index bed1c7908290e37af04821988173b333feeacc86..725a5d184753f42855848e01dbf7d624dc07b5d2 100644
--- a/98-frameworks-ray/ray-test.py
+++ b/98-frameworks-ray/ray-test.py
@@ -4,6 +4,12 @@ import socket
 import time
 import os
 import ray
+from ray import air, tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.models import ModelCatalog
+from ray.rllib.examples.envs.classes.simple_rpg import SimpleRPG
+from ray.rllib.examples._old_api_stack.models.simple_rpg_model import CustomTorchRPGModel
+from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
 
 
 @ray.remote
@@ -30,8 +36,10 @@ print(ray.cluster_resources())
 
 print("Call function:")
 print(ray.get(hello_world.remote()))
+print("")
 
-print("Test parallelism:")
+
+print("Test CPU parallelism with 200 tasks:")
 start = time.time()
 
 tasks = [get_hostname.remote()
@@ -46,3 +54,38 @@ print(f"Got {len(results)} results:")
 print(set(results))
 print(f"Scheduling took: {scheduling_end-start}s")
 print(f"Total time: {end-start}s")
+print("")
+
+
+##############################################################################
+# The following is only an example to demonstrate GPU usage in general;
+# please change the arguments and settings as needed and don't assume the below
+# configuration is "correct" for your use case.
+
+print("Running a PPO tuning example on GPU ...")
+print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
+
+ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel)
+
+config = (
+    PPOConfig()
+    .environment(SimpleRPG)
+    .framework("torch")
+    .env_runners(
+        rollout_fragment_length=1,
+        num_env_runners=4,
+        num_cpus_per_env_runner=32,
+        num_gpus_per_env_runner=1,
+    )
+    .training(train_batch_size=128, model={"custom_model": "my_model"})
+)
+
+stop = {NUM_ENV_STEPS_SAMPLED_LIFETIME: 1}
+
+tuner = tune.Tuner(
+    "PPO",
+    param_space=config.to_dict(),
+    run_config=air.RunConfig(stop=stop, verbose=1),
+)
+
+print(tuner.fit())