Commit 1e8d6fcc authored by Muck, Katrin

added new batch script for task based ray workers

parent 1af7c18a
#!/bin/bash
##############################################################################
# User Request:
# - allocate multiple full nodes
# - run ray head on batch host
# - run ray workers in tasks distributed over all nodes
# - run python script on batch host (using the actual workers)
#
# Provided Allocation:
# - 2 exclusive nodes
# - 2x 64 physical cores / 128 logical cores
# - 2x 512 GB memory
#
# VSC policy:
# - '--nodes' flag set -> exclusive node allocation
# - '--ntasks-per-node' & '--ntasks' are implicitly set to 64,
#   but are overridden here with explicit srun arguments
#
# Accounting:
# - 2x 64 core hours / hour
##############################################################################
#SBATCH --job-name=ray-test
#SBATCH --qos=zen3_0512 # select zen3_0512 default qos
#SBATCH --partition=zen3_0512 # select zen3_0512 hardware
#SBATCH --nodes=2 # tell VSC slurm to allocate 2 exclusive nodes
#SBATCH --time=00:05:00 # set time limit of 5 min for testing
# number of nodes
nodes_num=$SLURM_JOB_NUM_NODES
echo "nodes_num: $nodes_num"
# (physical) cpus per node (slurm cpus on node gives us logical cores)
cpus_per_node=$(( SLURM_CPUS_ON_NODE / 2 ))
echo "cpus_per_node: $cpus_per_node"
# cpus per task
cpus_per_task=32
echo "cpus_per_task: $cpus_per_task"
# tasks per node
tasks_per_node=$(( cpus_per_node / cpus_per_task ))
echo "tasks_per_node: $tasks_per_node"
# number of workers
tasks_total=$(( nodes_num * tasks_per_node ))
echo "tasks_total: $tasks_total"
# print general job info
../util/print_job_info.sh
# start the ray head directly on the batch host in the background
../util/print_task_info_min.sh "ray-head" "hello from ray head!" &
echo "starting ray head in background ..."
echo "ray-head" &
# next we start all workers with the configured number of cpus per task
srun --ntasks-per-node=$tasks_per_node --ntasks=$tasks_total --cpus-per-task=$cpus_per_task \
../util/print_task_info_min.sh "ray-worker" "hello from ray worker!" &
# note: the started tasks keep running as long as this batch process exists (or until they exit on their own)
# so either wait for all child processes to finish
wait
#
# OR
#
# run your own program (on the batch host)
#
#module load miniconda3
#eval "$(conda shell.bash hook)"
#conda activate my-env
#python my_program.py
#
# OR
#
# start your own program in another task
# don't forget to adjust resource allocation accordingly
# e.g. allocate another node and start workers at index 2
#
#srun $srun_args -r 1 ../my_program_script.sh
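#
# note: '$srun_args' above is only a placeholder for the srun flags used
# earlier in this script; written out it could look like this
# (hypothetical example, adjust node index and resources to your allocation):
#srun --nodes=1 --ntasks=1 --cpus-per-task=$cpus_per_task -r 1 ../my_program_script.sh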
#!/usr/bin/env bash
# default task name: the slurm task pid
TASK_NAME="$SLURM_TASK_PID"
# optional first argument: a name prefix for this task
if [ -n "$1" ]; then
    TASK_NAME="$1-$SLURM_TASK_PID"
    shift
fi
......
#!/usr/bin/env bash
# print any passed arguments as a message
if [ -n "$*" ]; then
    echo "$*"
fi
SCRIPT_DIR=$( dirname "${BASH_SOURCE[0]}" )
echo "========================================================================"
echo " TASK INFO ($SLURM_TASK_PID): $( hostname ) ($( date ))"
echo "========================================================================"
# show the resources available to this task
$SCRIPT_DIR/computed_available_resources.sh
# simulate some work so the task stays alive for a moment
sleep 10s
echo ""
echo "========================================================================"
echo " TASK END ($SLURM_TASK_PID): $( hostname ) ($( date ))"
echo "========================================================================"