diff --git a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
index 8ac44832e83f064576eb6884879bcbdc2ebd6715..df20d95dd69cd77e1205f9a9e58969aee1fdb980 100644
--- a/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
+++ b/02-skylake_0096-single-node/03-single-node-multiple-tasks.sh
@@ -22,10 +22,10 @@
 #SBATCH --job-name="full node; single program"
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=12 ../util/print_task_info.sh 
+srun --cpus-per-task=24 ../util/print_task_info.sh 
diff --git a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
index a689e307680c01f83f82bb00bd873640c9f94bde..e3efa3f292cb0980936551166b200e79a16e2416 100644
--- a/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
+++ b/03-skylake_0096-partial-node/03-task-based-multiple-cpus.sh
@@ -24,7 +24,7 @@
 
 #SBATCH --job-name="task based; 4 tasks; same program; 4 cores / task; 8 GB"
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
@@ -33,4 +33,4 @@
 
 # from sbatch docs: 
 # Beginning with 22.05, srun will not inherit the --cpus-per-task value
-srun --cpus-per-task=4 ../util/print_task_info.sh 
+srun --cpus-per-task=8 ../util/print_task_info.sh 
diff --git a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
index bb6cefbc793ac795b4f951cb922a188125e54240..331ee8e8b0f154ec8e806c27c7c8a7e5385f0dc4 100644
--- a/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
+++ b/03-skylake_0096-partial-node/04-task-based-multiple-programs.sh
@@ -24,7 +24,7 @@
 
 #SBATCH --job-name="task based; 4 tasks; different program; 4 cores / task; 8 GB"
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
@@ -33,4 +33,4 @@
 
 # from sbatch docs: 
 # Beginning with 22.05, srun will not inherit the --cpus-per-task value
-srun --cpus-per-task=4 --multi-prog "04-task-based-multiple-programs.conf"
+srun --cpus-per-task=8 --multi-prog "04-task-based-multiple-programs.conf"
diff --git a/03-skylake_0096-partial-node/05-task-based-hetjob.sh b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
index 31e4030d7611076d40972561713497a687da2191..0d030863f612bcec2ab7f51a099f229ac48eb6ff 100644
--- a/03-skylake_0096-partial-node/05-task-based-hetjob.sh
+++ b/03-skylake_0096-partial-node/05-task-based-hetjob.sh
@@ -31,7 +31,7 @@
 #SBATCH --qos=skylake_0096
 #SBATCH hetjob
 #SBATCH --ntasks=2
-#SBATCH --cpus-per-task=4
+#SBATCH --cpus-per-task=8
 #SBATCH --mem=8G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
diff --git a/05-skylake_0096-job-array/02-task-based-job-array.sh b/05-skylake_0096-job-array/02-task-based-job-array.sh
index a66d9d52888f57583132d2f11409b6958036d3be..db43815fd8ce67fe99c8bb1945365970561fe1c3 100644
--- a/05-skylake_0096-job-array/02-task-based-job-array.sh
+++ b/05-skylake_0096-job-array/02-task-based-job-array.sh
@@ -22,11 +22,11 @@
 #SBATCH --job-name="job array; 2 jobs; same program; 1 task; 16 physical cores; 4 GB"
 #SBATCH --array=0-1
 #SBATCH --ntasks=1
-#SBATCH --cpus-per-task=16
+#SBATCH --cpus-per-task=32
 #SBATCH --mem=4G
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=16 ../util/print_task_info.sh 
+srun --cpus-per-task=32 ../util/print_task_info.sh 
diff --git a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
index 3cee2aa498d11bb8b424400a7a3790365cde26fc..38bc1800b5ab233faf8c567c39665b9917fe9463 100644
--- a/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
+++ b/05-skylake_0096-job-array/05-node-based-multiple-tasks-job-array.sh
@@ -22,10 +22,10 @@
 #SBATCH --array=0-1
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=24
 #SBATCH --partition=skylake_0096
 #SBATCH --qos=skylake_0096
 
 ../util/print_job_info.sh
 
-srun --cpus-per-task=12 ../util/print_task_info.sh 
+srun --cpus-per-task=24 ../util/print_task_info.sh 
diff --git a/98-frameworks-ray/01-ray-python-multi-node.sh b/98-frameworks-ray/01-ray-python-multi-node.sh
index ea7a4194205b2265fa9003d61beca30a39689df9..bee3518a1e1e6935865691d92f3f56b711ede1f1 100755
--- a/98-frameworks-ray/01-ray-python-multi-node.sh
+++ b/98-frameworks-ray/01-ray-python-multi-node.sh
@@ -42,8 +42,8 @@ echo "nodes_num: $nodes_num"
 tasks_per_node=$SLURM_NTASKS_PER_NODE
 echo "tasks_per_node: $tasks_per_node"
 
-# (physical) cpus per node (slurm cpus on node gives us logical cores)
-cpus_per_node=$(( SLURM_CPUS_ON_NODE / 2 ))
+# logical cpus per node (slurm cpus on node gives us logical cores)
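+# note: ray counts logical cpus by default, hence the full SLURM_CPUS_ON_NODE value is used here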
+cpus_per_node=$(( SLURM_CPUS_ON_NODE ))
 echo "cpus_per_node: $cpus_per_node"
 
 # cpus per task: ray itself should do the work scheduling and hardware management 
diff --git a/98-frameworks-ray/02-ray-python-multi-node-gpu.sh b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d37a4731d4c4a936ca7c59f356a8b9150f1a8138
--- /dev/null
+++ b/98-frameworks-ray/02-ray-python-multi-node-gpu.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+
+##############################################################################
+# User Request:
+#   - allocate multiple full nodes with A100 GPUs
+#   - run ray head on batch host
+#   - run ray workers in tasks distributed over all nodes
+#   - run python script on batch host (using the actual workers)
+#
+# Provided Allocation:
+#   - 2 exclusive nodes
+#   - 2x 64 physical cores / 128 logical cores
+#   - 2x 512 GB memory
+#   - 4x A100 GPU with 40 GB memory each
+#
+# VSC policy:
+#   - '--nodes' flag set -> exclusive node allocation
+#   - '--ntasks-per-node' & '--ntasks' implicitly set to 64
+#     but overridden by the sbatch/srun arguments below
+#   
+# Accounting:
+#   - 2x 64 core hours / hour
+##############################################################################
+
+#SBATCH --job-name=ray-test
+#SBATCH --qos=zen3_0512_a100x2_devel    # select the zen3_0512_a100x2_devel qos for testing
+#SBATCH --partition=zen3_0512_a100x2    # select zen3_0512 nodes with a100 gpus
+#SBATCH --time=00:10:00                 # set a time limit of 10 min for testing
+#SBATCH --nodes=2                       # tell VSC slurm to allocate 2 exclusive nodes
+#SBATCH --gres=gpu:2                    # additionally allocate 2 gpus per node (= full node)
+#SBATCH --tasks-per-node=1              # 1 task per node (1 head + 1 worker)
+#SBATCH --cpus-per-task=128             # 128 (logical) cpus per task
+#SBATCH --gpus-per-task=2               # 2 gpus per task
+#SBATCH --hint=nomultithread            # specify this to get 1 thread per physical core
+
+# optionally load packages and/or activate a conda environment...
+module load miniconda3
+eval "$(conda shell.bash hook)"
+conda activate ray
+
+# number of nodes
+nodes_num=$SLURM_JOB_NUM_NODES
+echo "nodes_num: $nodes_num"
+
+# tasks per node -> 1 task per node that starts a head/worker
+tasks_per_node=$SLURM_NTASKS_PER_NODE
+echo "tasks_per_node: $tasks_per_node"
+
+# cpus per task: let ray know how many cpus there are for its internal scheduling
+cpus_per_task=$SLURM_CPUS_PER_TASK
+echo "cpus_per_task: $cpus_per_task"
+
+# gpus per task: get the number of gpus per task
+gpus_per_task=$SLURM_GPUS_PER_TASK
+echo "gpus_per_task: $gpus_per_task"
+
+# number of tasks & workers
+tasks_total=$(( nodes_num * tasks_per_node ))
+echo "tasks_total: $tasks_total"
+
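+# the ray head occupies the first task (on the batch host), so there is one worker fewer than tasks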
+workers_total=$(( tasks_total - 1 ))
+echo "workers_total: $workers_total"
+echo ""
+
+# print general job info
+../util/print_job_info.sh
+
+# the first task we start goes to the batch host, so to get the ip address
+# we can just use this node's ib0 ip address for the head task
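+# (ib0 is the node's infiniband interface, so ray traffic stays on the fast interconnect;
+# the commented 'hostname --ip-address' variant typically resolves to the default interface instead)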
+head_node_ip=$( ip address show ib0 | awk '/inet / {print $2}' | cut -d "/" -f 1 )
+# head_node_ip=$( hostname --ip-address )
+export head_node_ip
+export head_node_port=6379
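+# 6379 is ray's default head port; pick a different one if it is already in use on the node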
+
+logfile="slurm-${SLURM_JOB_ID}-head.out"
+srun_args_head="--nodes=1 --ntasks=1 --cpus-per-task=$cpus_per_task --gres=gpu:$gpus_per_task --input=none --output=$logfile"
+echo "starting head with: $srun_args_head"
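+# 'ray start --block' keeps the head process in the foreground of its task;
+# the trailing '&' backgrounds the srun call so this batch script can continue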
+srun $srun_args_head \
+    ray start --head --log-color=false \
+        --node-ip-address="$head_node_ip" --port="$head_node_port" \
+        --num-cpus="$cpus_per_task" --num-gpus="$gpus_per_task" \
+        --block &
+        
+
+# wait to give the ray head time to start
+sleep 10s
+
+# start workers
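+# '--relative=1' places the worker step(s) starting at the second node of the allocation,
+# leaving the first node (the batch host) to the head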
+logfile="slurm-${SLURM_JOB_ID}-worker-%t.out"
+srun_args_worker="--relative=1 --nodes=$workers_total --ntasks=$workers_total --cpus-per-task=$cpus_per_task --gres=gpu:$gpus_per_task --input=none --output=$logfile"
+echo "starting $workers_total workers with: $srun_args_worker"
+srun $srun_args_worker \
+    ray start --log-color=false \
+        --address="$head_node_ip:$head_node_port" \
+        --num-cpus="$cpus_per_task" --num-gpus="$gpus_per_task" \
+        --block &
+        
+# wait for 10s to give the workers time to start
+sleep 10s
+
+# note: the started tasks keep running as long as this batch process exists (or until they exit on their own),
+# so either wait for all child processes to finish
+# wait
+
+#
+# 	OR
+#
+# run your own program (on the batch host)
+#   the assumption is that in such a setup the main script only schedules work and
+#   waits for it to finish. the program can use the batch step's resources.
+#
+echo "$( date ): executing script ..."
+python3 ray-test.py "$head_node_ip:$head_node_port"
+echo "$( date ): script end."
+
+#
+# 	OR
+#
+# start your own program in another task
+#   better suited if the main program itself contains computationally intensive steps
+#   don't forget to adjust the resource allocation accordingly, e.g. allocate another node
+#   and start the workers at index 2
+#
+#srun --relative=1 --nodes=1 --ntasks=1 --cpus-per-task=$cpus_per_task --gres=gpu:$gpus_per_task ../my_program_script.sh
diff --git a/98-frameworks-ray/environment.yaml b/98-frameworks-ray/environment.yaml
index 43f1116c9c8eb11245852e64a43164c5c7ac99d5..c4376886eaf1004497824789898276b6a6855cdc 100644
--- a/98-frameworks-ray/environment.yaml
+++ b/98-frameworks-ray/environment.yaml
@@ -2,8 +2,10 @@ name: ray
 channels:
   - bioconda
   - conda-forge
-  - defaults
+  - nvidia
+  - pytorch
 dependencies:
-  - python=3.11
+  - python=3.*
   - libblas=*=*mkl
   - ray-all
+  - pytorch=*=*cuda12.1*
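+  # cuda-enabled pytorch build; keep the cuda version in the build string compatible with the driver on the gpu nodes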
diff --git a/98-frameworks-ray/ray-test.py b/98-frameworks-ray/ray-test.py
index bed1c7908290e37af04821988173b333feeacc86..725a5d184753f42855848e01dbf7d624dc07b5d2 100644
--- a/98-frameworks-ray/ray-test.py
+++ b/98-frameworks-ray/ray-test.py
@@ -4,6 +4,12 @@ import socket
 import time
 import os
 import ray
+from ray import air, tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.models import ModelCatalog
+from ray.rllib.examples.envs.classes.simple_rpg import SimpleRPG
+from ray.rllib.examples._old_api_stack.models.simple_rpg_model import CustomTorchRPGModel
+from ray.rllib.utils.metrics import NUM_ENV_STEPS_SAMPLED_LIFETIME
 
 
 @ray.remote
@@ -30,8 +36,10 @@ print(ray.cluster_resources())
 
 print("Call function:")
 print(ray.get(hello_world.remote()))
+print("")
 
-print("Test parallelism:")
+
+print("Test CPU parallelism with 200 tasks:")
 
 start = time.time()
 tasks = [get_hostname.remote()
@@ -46,3 +54,38 @@ print(f"Got {len(results)} results:")
 print(set(results))
 print(f"Scheduling took: {scheduling_end-start}s")
 print(f"Total time: {end-start}s")
+print("")
+
+
+##############################################################################
+# The following is only an example to demonstrate GPU usage in general
+# please change the arguments and settings as needed and don't assume the below
+# configuration is "correct" for your use case
+
+print("Running a PPO tuning example on GPU ...")
+print(f'CUDA_VISIBLE_DEVICES: {os.environ["CUDA_VISIBLE_DEVICES"]}')
+
+ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel)
+
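+# the 4 env runners with 1 gpu each use the 4 A100s of the 2-node allocation above;
+# 32 cpus per runner leaves headroom for the trainer; adjust to your own resources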
+config = (
+    PPOConfig()
+    .environment(SimpleRPG)
+    .framework("torch")
+    .env_runners(
+        rollout_fragment_length=1, 
+        num_env_runners=4, 
+        num_cpus_per_env_runner=32,
+        num_gpus_per_env_runner=1,        
+    )
+    .training(train_batch_size=128, model={"custom_model": "my_model"})
+)
+
+stop = {NUM_ENV_STEPS_SAMPLED_LIFETIME: 1}
+
+tuner = tune.Tuner(
+    "PPO",
+    param_space=config.to_dict(),
+    run_config=air.RunConfig(stop=stop, verbose=1),
+)
+
+print(tuner.fit())