Commit 17ed7b44 authored by Pfister, Martin

Update reservations for D2

%% Cell type:markdown id:d0b1f163-108f-4587-8b0d-0246158ee528 tags:
# DDP example with Phi-3.5 mini instruct and openassistant-guanaco dataset
In this example, a network is trained on multiple GPUs with the help of DDP (Distributed Data Parallel). This approach allows networks that fit into the memory of a single GPU to be trained on multiple GPUs in parallel in order to speed up the training.
If we want to use multiple GPUs, we need to write the code to a file and submit the job to the SLURM scheduler, because the JupyterHub that we are using today does not have access to any GPU. This example uses two GPUs on one node, but could be extended simply by adjusting the number of GPUs and nodes in the SLURM script.
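As a quick reminder of what DDP does under the hood, here is a minimal sketch (not part of the original notebook; the toy `torch.nn.Linear` model and the single training step are purely illustrative): every process holds a full copy of the model, works on its own shard of the data, and gradients are averaged across processes during `backward()`. The `SFTTrainer` used below does this wrapping for us when the script is launched with `torchrun`.
``` python
# Minimal DDP sketch (illustrative only; the SFTTrainer below handles this for us).
# Assumes it is launched with `torchrun --nproc_per_node=<num_gpus> script.py`,
# which sets RANK, LOCAL_RANK and WORLD_SIZE in the environment.
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main():
    dist.init_process_group(backend='nccl')              # one process per GPU
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)

    model = torch.nn.Linear(10, 10).to(f'cuda:{local_rank}')  # stand-in for a real model
    ddp_model = DDP(model, device_ids=[local_rank])            # gradients are synced across ranks

    optimizer = torch.optim.AdamW(ddp_model.parameters(), lr=1e-3)
    inputs = torch.randn(8, 10, device=f'cuda:{local_rank}')   # each rank sees its own data shard
    loss = ddp_model(inputs).sum()
    loss.backward()                                            # gradient all-reduce happens here
    optimizer.step()

    dist.destroy_process_group()

if __name__ == '__main__':
    main()
```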
%% Cell type:markdown id:d7d9ee84-e29b-4c05-b124-50e735033760 tags:
#### First, we write the Python code to a file:
%% Cell type:code id:565c4533-5104-4a7c-a688-8b6acb72e17d tags:
``` python
%%writefile phi3_guanaco_ddp.py

# Import libraries
import torch
from accelerate import PartialState
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import pynvml

def print_gpu_utilization():
    pynvml.nvmlInit()
    device_count = pynvml.nvmlDeviceGetCount()
    memory_used = []
    for device_index in range(device_count):
        device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
        memory_used.append(device_info.used/1024**3)
    print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')

# Choose a model and load tokenizer and model (using 4-bit quantization):
model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
# model_name = 'microsoft/Phi-3.5-mini-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right'

# For multi-GPU training, find out how many GPUs there are and which one we should use:
ps = PartialState()
num_processes = ps.num_processes
process_index = ps.process_index
local_process_index = ps.local_process_index

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    device_map={'': local_process_index},  # Changed for DDP
    attn_implementation='eager',  # 'eager', 'sdpa', or 'flash_attention_2'
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Load the guanaco dataset
guanaco_train = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='train')
guanaco_test = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='test')
# guanaco_train = load_dataset('timdettmers/openassistant-guanaco', split='train')
# guanaco_test = load_dataset('timdettmers/openassistant-guanaco', split='test')

guanaco_train = guanaco_train.map(lambda entry: {
    'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
    'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
})
guanaco_test = guanaco_test.map(lambda entry: {
    'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
    'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
})
guanaco_train = guanaco_train.map(lambda entry: {'messages': [
    {'role': 'user', 'content': entry['question1']},
    {'role': 'assistant', 'content': entry['answer1']}
]})
guanaco_test = guanaco_test.map(lambda entry: {'messages': [
    {'role': 'user', 'content': entry['question1']},
    {'role': 'assistant', 'content': entry['answer1']}
]})

model.config.use_cache = False  # KV cache can only speed up inference, but we are doing training.

# Add low-rank adapters (LoRA) to the model:
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=16,
    lora_alpha=32,  # rule of thumb: lora_alpha should be 2*r
    lora_dropout=0.05,
    bias='none',
    target_modules='all-linear',
)
model = get_peft_model(model, peft_config)

training_arguments = SFTConfig(
    output_dir='output/phi-3.5-mini-instruct-guanaco-ddp',
    per_device_train_batch_size=8//num_processes,  # Adjust per-device batch size for DDP
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
                                  # - enabled: 11 GB GPU RAM and 8 samples/second
                                  # - disabled: 40 GB GPU RAM and 12 samples/second
    gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
    ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
    log_level_replica='error',  # Disable warnings in all but the first process.
    optim='adamw_torch',
    learning_rate=2e-4,  # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B
    logging_strategy='no',
    # logging_strategy='steps',  # 'no', 'epoch' or 'steps'
    # logging_steps=10,
    save_strategy='no',  # 'no', 'epoch' or 'steps'
    # save_steps=2000,
    # num_train_epochs=5,
    max_steps=100,
    bf16=True,  # mixed precision training
    report_to='none',  # disable wandb
    max_seq_length=1024,
)

def formatting_func(entry):
    return tokenizer.apply_chat_template(entry['messages'], tokenize=False)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=guanaco_train,
    eval_dataset=guanaco_test,
    processing_class=tokenizer,
    formatting_func=formatting_func,
)

if process_index == 0:  # Only print in first process.
    if hasattr(trainer.model, "print_trainable_parameters"):
        trainer.model.print_trainable_parameters()

eval_result = trainer.evaluate()
if process_index == 0:
    print("Evaluation on test dataset before finetuning:")
    print(eval_result)

train_result = trainer.train()
if process_index == 0:
    print("Training result:")
    print(train_result)

eval_result = trainer.evaluate()
if process_index == 0:
    print("Evaluation on test dataset after finetuning:")
    print(eval_result)

# Print memory usage once per node:
if local_process_index == 0:
    print_gpu_utilization()

# # Save model in first process only:
# if process_index == 0:
#     trainer.save_model()
```
%% Output
Overwriting phi3_guanaco_ddp.py
%% Cell type:markdown id:a1769457-c82f-4954-89a5-7b3b47ed72cc tags:
#### Next, we write a SLURM script (initially using 1 GPU only):
%% Cell type:code id:c2d643dc-4e7f-4aad-a24a-4d80d8cb33c8 tags:
``` python
%%writefile run_phi3_guanaco_1gpu.slurm
#!/bin/bash
#SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg
#SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_castiel2
## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1
#SBATCH --gpus-per-task=1   # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=120GB         # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=8   # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00
# Load conda:
# module purge
# module load miniconda3
# eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output:
set -x
# Print current time and date:
date
# Print host name:
hostname
# List available GPUs:
nvidia-smi
# Run:
time python3 phi3_guanaco_ddp.py
```
%% Output
Overwriting run_phi3_guanaco_1gpu.slurm
%% Cell type:markdown id:4786f174-8231-4e1e-ae39-bff66ffccddc tags:
#### We can now submit the SLURM script and, once the job has run, look at the output:
%% Cell type:code id:8e8cc6fe-ec18-4856-b99a-e1e2f4f5ca86 tags:
``` python
!sbatch run_phi3_guanaco_1gpu.slurm
```
%% Output
Submitted batch job 12957510
%% Cell type:code id:7886c3e1-da04-49b9-bf9d-806083239ad9 tags:
``` python
!squeue --me
```
%% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12957510 boost_usr run_phi3 mpfister PD 0:00 1 (None)
12952669 boost_usr jupyterl mpfister R 3:50:30 1 lrdn3456
%% Cell type:markdown id:e8fde657-6fdd-4629-b177-59dce5521e9e tags:
Change the number in the command below to the JOBID of the batch job that you just submitted:
%% Cell type:code id:34e5628f-b9cd-4d89-93c1-58cd811225e1 tags:
``` python
!cat slurm-12957510.out
```
%% Output
+ date
Mon Feb 24 20:00:33 CET 2025
+ hostname
lrdn1561.leonardo.local
+ nvidia-smi
Mon Feb 24 20:00:33 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 |
| N/A 43C P0 63W / 476W| 0MiB / 65536MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
+ python3 phi3_guanaco_ddp.py
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.42s/it]
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-24 20:00:56,667] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
You are not running the flash-attention implementation, expect numerical differences.
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
100%|██████████| 65/65 [00:28<00:00, 2.25it/s]
Evaluation on test dataset before finetuning:
{'eval_loss': 1.6385879516601562, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 31.4062, 'eval_samples_per_second': 16.494, 'eval_steps_per_second': 2.07}
100%|██████████| 100/100 [02:37<00:00, 1.57s/it]
{'train_runtime': 157.321, 'train_samples_per_second': 5.085, 'train_steps_per_second': 0.636, 'train_loss': 1.1882408142089844, 'epoch': 0.08}
Training result:
TrainOutput(global_step=100, training_loss=1.1882408142089844, metrics={'train_runtime': 157.321, 'train_samples_per_second': 5.085, 'train_steps_per_second': 0.636, 'total_flos': 1.2286246663913472e+16, 'train_loss': 1.1882408142089844, 'epoch': 0.08123476848090982})
100%|██████████| 65/65 [00:28<00:00, 2.26it/s]
Evaluation on test dataset after finetuning:
{'eval_loss': 1.2124745845794678, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 29.4068, 'eval_samples_per_second': 17.615, 'eval_steps_per_second': 2.21, 'epoch': 0.08123476848090982}
Memory occupied on GPUs: 16.9 GB.
real 4m3.923s
user 3m46.567s
sys 0m6.340s
%% Cell type:markdown id:c9519e17-ec88-4b56-ab1d-1fbc4cb15612 tags:
#### Now, we write another SLURM script where we use `torchrun` to train on multiple GPUs using DDP and submit the script to the scheduler again:
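`torchrun` starts one copy of the training script per GPU and passes the process topology through environment variables, which `PartialState` in `phi3_guanaco_ddp.py` then reads for us. As a small illustration (this snippet is not part of the training script), each spawned process could inspect those variables like this:
``` python
# Illustrative only: torchrun sets RANK, LOCAL_RANK, WORLD_SIZE and the rendezvous
# address in the environment of every process it spawns.
import os

for var in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT'):
    print(f'{var}={os.environ.get(var, "<not set>")}')
```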
%% Cell type:code id:7c012ab2-b6ad-4078-aa60-a37ada2c8012 tags:
``` python
%%writefile run_phi3_guanaco_ddp.slurm
#!/bin/bash
#SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg
#SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_castiel2
## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1
#SBATCH --gpus-per-task=2   # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=240GB         # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16  # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00
# Load conda:
# module purge
# module load miniconda3
# eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output:
set -x
# Print current time and date:
date
# Print host name:
hostname
# List available GPUs:
nvidia-smi
# Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1)  # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments:
export LAUNCHER="torchrun \
    --nnodes=$SLURM_JOB_NUM_NODES \
    --nproc_per_node=$SLURM_GPUS_ON_NODE \
    --rdzv_id=$SLURM_JOB_ID \
    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
    --rdzv_backend=c10d"
# Set training script that will be executed:
export PROGRAM="phi3_guanaco_ddp.py"
# Run:
time srun bash -c "$LAUNCHER $PROGRAM"
```
%% Output
Overwriting run_phi3_guanaco_ddp.slurm
%% Cell type:code id:c7a766c6-f7ec-49f6-a557-aa0b5d8ec357 tags:
``` python
!sbatch run_phi3_guanaco_ddp.slurm
```
%% Output
Submitted batch job 12957512
%% Cell type:code id:37fd35dd-3ba0-4627-a53c-ca6542de7b2f tags:
``` python
!squeue --me
```
%% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12957512 boost_usr run_phi3 mpfister PD 0:00 1 (None)
12957510 boost_usr run_phi3 mpfister R 0:10 1 lrdn1561
12952669 boost_usr jupyterl mpfister R 3:50:41 1 lrdn3456
%% Cell type:code id:e2e2a4c2-e157-40cd-9bc6-8de53b7065e3 tags:
``` python
!cat slurm-12957512.out
```
%% Output
+ date
Mon Feb 24 20:00:45 CET 2025
+ hostname
lrdn2199.leonardo.local
+ nvidia-smi
Mon Feb 24 20:00:45 2025
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:8F:00.0 Off | 0 |
| N/A 42C P0 62W / 461W| 0MiB / 65536MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM-64GB On | 00000000:C8:00.0 Off | 0 |
| N/A 43C P0 62W / 460W| 0MiB / 65536MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
++ shuf -i 20000-30000 -n 1
+ export MASTER_PORT=22633
+ MASTER_PORT=22633
++ head -n 1
++ scontrol show hostnames lrdn2199
+ export MASTER_ADDR=lrdn2199
+ MASTER_ADDR=lrdn2199
+ export OMP_NUM_THREADS=16
+ OMP_NUM_THREADS=16
+ export 'LAUNCHER=torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12957512 --rdzv_endpoint=lrdn2199:22633 --rdzv_backend=c10d'
+ LAUNCHER='torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12957512 --rdzv_endpoint=lrdn2199:22633 --rdzv_backend=c10d'
+ export PROGRAM=phi3_guanaco_ddp.py
+ PROGRAM=phi3_guanaco_ddp.py
+ srun bash -c 'torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12957512 --rdzv_endpoint=lrdn2199:22633 --rdzv_backend=c10d phi3_guanaco_ddp.py'
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.94s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.43s/it]
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.
[rank0]:[W224 20:01:08.468964634 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
[rank1]:[W224 20:01:08.506471860 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-24 20:01:08,362] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-24 20:01:08,392] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences.
Evaluation on test dataset before finetuning:
{'eval_loss': 1.630933165550232, 'eval_model_preparation_time': 0.0036, 'eval_runtime': 20.0255, 'eval_samples_per_second': 25.867, 'eval_steps_per_second': 1.648}
100%|██████████| 33/33 [00:17<00:00, 1.88it/s]
{'train_runtime': 87.6153, 'train_samples_per_second': 9.131, 'train_steps_per_second': 1.141, 'train_loss': 1.1826993560791015, 'epoch': 0.08}
100%|██████████| 100/100 [01:27<00:00, 1.14it/s]
Training result:
TrainOutput(global_step=100, training_loss=1.1826993560791015, metrics={'train_runtime': 87.6153, 'train_samples_per_second': 9.131, 'train_steps_per_second': 1.141, 'total_flos': 1.0099122749046784e+16, 'train_loss': 1.1826993560791015, 'epoch': 0.08123476848090982})
Evaluation on test dataset after finetuning:
{'eval_loss': 1.2069200277328491, 'eval_model_preparation_time': 0.0036, 'eval_runtime': 18.1952, 'eval_samples_per_second': 28.469, 'eval_steps_per_second': 1.814, 'epoch': 0.08123476848090982}
100%|██████████| 33/33 [00:17<00:00, 1.88it/s]
Memory occupied on GPUs: 19.8 + 16.2 GB.
real 2m31.132s
user 0m0.265s
sys 0m0.009s
%% Cell type:markdown id:e102f7d9-ca9f-486e-937e-c3ee3a09fc40 tags:
#### Finally, we can clean up and delete the files that we just created:
%% Cell type:code id:6a52fe7c-9bfe-45d0-84ed-9287c9c84f0c tags:
``` python
!rm phi3_guanaco_ddp.py run_phi3_guanaco_1gpu.slurm run_phi3_guanaco_ddp.slurm slurm-*.out
```
%% Cell type:code id:e15b7d0b-924a-46f1-a23f-42ee5964dfec tags:
``` python
```
%% Cell type:code id:24fdaa48-f908-4b04-9132-681abc3a7ca2 tags:
``` python
```
%% Cell type:markdown id:0a77a1b6-4241-48f1-8cfe-85b202e3d7c0 tags:
### Summary
DDP speeds up training by using multiple GPUs for models that fit into the memory of a single GPU.
| Number of GPUs used | Training time |
| - | - |
| 1 GPU | ? |
| 2 GPUs | ? |
| 4 GPUs | ? |
| 8 GPUs (2 nodes) | ? |
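Once the table is filled in, the speedup and parallel efficiency can be computed from the `train_runtime` values reported by the trainer. A small sketch, using the two runtimes observed in the outputs above (roughly 157 s on 1 GPU and 88 s on 2 GPUs):
``` python
# Sketch: compute speedup and parallel efficiency from the train_runtime values
# reported above; extend the dictionary with your own 4- and 8-GPU measurements.
runtimes = {1: 157.321, 2: 87.6153}  # number of GPUs -> train_runtime in seconds

baseline = runtimes[1]
for gpus, t in sorted(runtimes.items()):
    speedup = baseline / t
    efficiency = speedup / gpus
    print(f'{gpus} GPU(s): {t:7.1f} s, speedup {speedup:.2f}x, efficiency {efficiency:.0%}')
```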
%% Cell type:code id:53ec53ea-9e08-48d0-9b90-b3212543914d tags:
``` python
```
%% Cell type:markdown id:5fb8d0c9-ab40-455a-b6d8-9478c9ffb2e0 tags:
# Hugging Face Accelerate
In this notebook, we are going to write a Python and a SLURM script directly from within cells and also launch the SLURM script.
Since we only have limited resources, we are going to use a small dataset in combination with a small model. However, this still demonstrates how to use Hugging Face's [Accelerate](https://huggingface.co/docs/accelerate/index) library to perform distributed training on multiple GPUs across multiple nodes. In our case, we are using 2 nodes with 2 NVIDIA A100 GPUs each.
Within a node, the GPUs on LEONARDO are connected via NVLink.
Hugging Face Accelerate simplifies distributed training; its key benefits are:
- Automated Distributed Setup: Accelerate automatically initializes the distributed environment. It configures process groups, sets the appropriate environment variables, and assigns GPUs to processes using PyTorch's native Distributed Data Parallel (DDP). This means you don't have to manually set up multi-GPU execution or write boilerplate code.
- Device and Process Management: With utilities like PartialState, Accelerate provides easy access to details such as the number of processes, process indices, and local device assignments (see the sketch after this list). This information is crucial for tasks like sharding data, adjusting batch sizes per GPU, and ensuring that only one process handles logging or model saving.
- Seamless Scaling: Accelerate allows your code to run on both single and multiple GPUs with minimal modifications. Whether you're training on one GPU or several, Accelerate handles the synchronization of model parameters and gradients across devices, making your code more portable and scalable.
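As a rough illustration of the PartialState utilities mentioned above (a minimal sketch, not part of the training script; the attribute names are the ones used in phi3_guanaco_ddp.py):
``` python
# Minimal sketch of the PartialState utilities mentioned above.
# When launched with torchrun or `accelerate launch` there is one process per GPU;
# when run as a single process, num_processes is simply 1.
from accelerate import PartialState

state = PartialState()
print(f'process {state.process_index} of {state.num_processes}, '
      f'local index {state.local_process_index}, device {state.device}')

if state.is_main_process:
    print('Only the main process prints this (e.g. for logging or saving the model).')
```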
#### Python script
Let's go through the Python script, so you know what we are launching here. <br>
Accelerate doesn't mandate that the code be wrapped in a main() function, but it is highly recommended, especially for distributed or multi-GPU setups. Here's why (a small skeleton follows below):
- Multiprocessing Safety:
When using distributed training, processes are spawned that import your script. By wrapping your code in a main() function and using the `if __name__ == "__main__":` guard, you prevent unintended code execution in child processes.
- Accelerate Configuration:
The Accelerate config file often includes an entry like `main_training_function: main`. This instructs Accelerate to look for a function named main to kick off training. If you don't define it, you might run into errors.
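As a bare-bones illustration of this structure (a sketch only; the full script in the next cell follows the same pattern):
``` python
# Bare-bones skeleton of the recommended structure; the real script below fills this in.
from accelerate import Accelerator

def main():
    accelerator = Accelerator()  # sets up the distributed environment
    if accelerator.is_main_process:
        print(f'Running on {accelerator.num_processes} process(es).')
    # ... build model, data and trainer here ...

if __name__ == "__main__":
    main()  # executed once per spawned process
```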
%% Cell type:code id:61c26ccb-8f0b-4b05-ae73-96739c07215f tags:
``` python
%%writefile phi3_guanaco_accelerate.py
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import pynvml

def print_gpu_utilization():
    pynvml.nvmlInit()
    device_count = pynvml.nvmlDeviceGetCount()
    memory_used = []
    for device_index in range(device_count):
        device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
        memory_used.append(device_info.used / 1024**3)
    print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')

def main():
    # Initialize Accelerator; it will auto-detect the distributed environment from SLURM
    accelerator = Accelerator()
    device = accelerator.device
    if accelerator.is_main_process:
        print(f"Running on device: {device}")
    # Define model name and load tokenizer
    model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = 'right'
    # Load the model with 4-bit quantization. Note that we do not specify a device map manually.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=torch.bfloat16,
        ),
        attn_implementation='eager',
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    # Move the model to the device specified by Accelerator
    model.to(device)
    # Disable caching (only beneficial for inference)
    model.config.use_cache = False
    # Add LoRA adapters
    peft_config = LoraConfig(
        task_type='CAUSAL_LM',
        r=16,
        lora_alpha=32,  # rule of thumb: lora_alpha should be about 2 * r
        lora_dropout=0.05,
        bias='none',
        target_modules='all-linear',
    )
    model = get_peft_model(model, peft_config)
    # Load and preprocess the dataset
    guanaco_train = load_dataset(
        '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco',
        split='train'
    )
    guanaco_test = load_dataset(
        '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco',
        split='test'
    )
    # Process each example to extract the user prompt and assistant response
    guanaco_train = guanaco_train.map(lambda entry: {
        'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
        'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
    })
    guanaco_test = guanaco_test.map(lambda entry: {
        'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
        'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
    })
    # Restructure to a chat format expected by our formatting function
    guanaco_train = guanaco_train.map(lambda entry: {'messages': [
        {'role': 'user', 'content': entry['question1']},
        {'role': 'assistant', 'content': entry['answer1']}
    ]})
    guanaco_test = guanaco_test.map(lambda entry: {'messages': [
        {'role': 'user', 'content': entry['question1']},
        {'role': 'assistant', 'content': entry['answer1']}
    ]})
    # Define training arguments with SFTConfig.
    # Note: We use accelerator.num_processes to adjust the per-device batch size.
    training_arguments = SFTConfig(
        output_dir='output/phi-3.5-mini-instruct-guanaco-ddp',
        per_device_train_batch_size=8 // accelerator.num_processes,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant': False},
        ddp_find_unused_parameters=False,
        log_level_replica='error',
        optim='adamw_torch',
        learning_rate=2e-4,
        logging_strategy='no',
        save_strategy='no',
        max_steps=100,
        bf16=True,
        report_to='none',
        max_seq_length=1024,
    )

    def formatting_func(entry):
        return tokenizer.apply_chat_template(entry['messages'], tokenize=False)

    # Create the SFTTrainer.
    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        train_dataset=guanaco_train,
        eval_dataset=guanaco_test,
        processing_class=tokenizer,
        formatting_func=formatting_func,
    )
    # Optionally print trainable parameters on the main process only.
    if accelerator.is_main_process and hasattr(trainer.model, "print_trainable_parameters"):
        trainer.model.print_trainable_parameters()
    # Evaluate before training
    eval_result = trainer.evaluate()
    if accelerator.is_main_process:
        print("Evaluation on test dataset before finetuning:")
        print(eval_result)
    # Train the model
    train_result = trainer.train()
    if accelerator.is_main_process:
        print("Training result:")
        print(train_result)
    # Evaluate after training
    eval_result = trainer.evaluate()
    if accelerator.is_main_process:
        print("Evaluation on test dataset after finetuning:")
        print(eval_result)
    # Print GPU memory usage (only once per node)
    if accelerator.local_process_index == 0:
        print_gpu_utilization()

if __name__ == "__main__":
    main()
```
%% Output
Writing phi3_guanaco_accelerate.py
%% Cell type:markdown id:f98a9e9e-74d8-49ca-b0c2-0fbf9c90318d tags:
#### Next, we write a SLURM script, initially using 1 GPU only and the exact same setup as in the DDP example:
While this code works and runs, it is handled differently than when it is launched with Accelerate. Note the times in the output.
%% Cell type:code id:3d106c10-5d66-4fa1-889c-2db3c01581d0 tags:
``` python
%%writefile run_phi3_guanaco_accelerate_1gpu.slurm
#!/bin/bash
#SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_EUD20b #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1 #SBATCH --nodes=1
#SBATCH --gpus-per-task=1 # up to 4 on Leonardo #SBATCH --gpus-per-task=1 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo #SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Run: # Run:
time python3 phi3_guanaco_accelerate.py time python3 phi3_guanaco_accelerate.py
``` ```
%% Output %% Output
Overwriting accelerate_run_phi3_guanaco_1gpu.slurm Overwriting accelerate_run_phi3_guanaco_1gpu.slurm
%% Cell type:markdown id:2a09f4af-dd63-4aa5-8df4-fa540d721ad3 tags: %% Cell type:markdown id:2a09f4af-dd63-4aa5-8df4-fa540d721ad3 tags:
#### We can now submit the SLURM script and, once the job has finished, look at the output:
%% Cell type:code id:4d42590c-ccbb-484f-8811-524a238e7d74 tags: %% Cell type:code id:4d42590c-ccbb-484f-8811-524a238e7d74 tags:
``` python ``` python
!sbatch run_phi3_guanaco_accelerate_1gpu.slurm !sbatch run_phi3_guanaco_accelerate_1gpu.slurm
``` ```
%% Output %% Output
Submitted batch job 12961431 Submitted batch job 12961431
%% Cell type:code id:21f3ff05-f7ba-4bee-be3a-00f3566c647b tags: %% Cell type:code id:21f3ff05-f7ba-4bee-be3a-00f3566c647b tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12960270 boost_usr jupyter. sharriso R 2:05:38 1 lrdn0151 12960270 boost_usr jupyter. sharriso R 2:05:38 1 lrdn0151
12961431 boost_usr accelera sharriso R 0:30 1 lrdn0675 12961431 boost_usr accelera sharriso R 0:30 1 lrdn0675
%% Cell type:markdown id:f26c5156-e8bf-405c-a360-ba6721e5b35e tags: %% Cell type:markdown id:f26c5156-e8bf-405c-a360-ba6721e5b35e tags:
Change the number in the command below to the JOBID of the batch job that you just submitted: Change the number in the command below to the JOBID of the batch job that you just submitted:
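Alternatively, instead of editing the JOBID by hand, a shell one-liner like the following (an optional convenience, not part of the original commands) prints the most recently modified SLURM output file:
``` python
# Show the newest slurm-*.out file in the current directory.
!cat $(ls -t slurm-*.out | head -n 1)
```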
%% Cell type:code id:3ba8f461-306c-412c-a65b-4bd203c5e006 tags: %% Cell type:code id:3ba8f461-306c-412c-a65b-4bd203c5e006 tags:
``` python ``` python
!cat slurm-12961431.out !cat slurm-12961431.out
``` ```
%% Output %% Output
+ date + date
Tue Feb 25 00:06:29 CET 2025 Tue Feb 25 00:06:29 CET 2025
+ hostname + hostname
lrdn0675.leonardo.local lrdn0675.leonardo.local
+ nvidia-smi + nvidia-smi
Tue Feb 25 00:06:29 2025 Tue Feb 25 00:06:29 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 |
| N/A 43C P0 64W / 475W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 64W / 475W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
+ python3 phi3_guanaco_accelerate.py + python3 phi3_guanaco_accelerate.py
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Running on device: cuda Running on device: cuda
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.30s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.30s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 9846 examples [00:00, 147383.38 examples/s] Generating train split: 9846 examples [00:00, 147383.38 examples/s]
Generating test split: 518 examples [00:00, 136258.98 examples/s] Generating test split: 518 examples [00:00, 136258.98 examples/s]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 9846/9846 [00:00<00:00, 34352.20 examples/s] Map: 100%|██████████| 9846/9846 [00:00<00:00, 34352.20 examples/s]
Map: 100%|██████████| 518/518 [00:00<00:00, 26097.26 examples/s] Map: 100%|██████████| 518/518 [00:00<00:00, 26097.26 examples/s]
Map: 100%|██████████| 9846/9846 [00:00<00:00, 24355.27 examples/s] Map: 100%|██████████| 9846/9846 [00:00<00:00, 24355.27 examples/s]
Map: 100%|██████████| 518/518 [00:00<00:00, 19854.97 examples/s] Map: 100%|██████████| 518/518 [00:00<00:00, 19854.97 examples/s]
Map: 100%|██████████| 9846/9846 [00:01<00:00, 6866.43 examples/s] Map: 100%|██████████| 9846/9846 [00:01<00:00, 6866.43 examples/s]
Map: 100%|██████████| 518/518 [00:00<00:00, 6257.20 examples/s] Map: 100%|██████████| 518/518 [00:00<00:00, 6257.20 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-25 00:06:58,774] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 00:06:58,774] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
df: /leonardo/home/userexternal/sharriso/.triton/autotune: No such file or directory df: /leonardo/home/userexternal/sharriso/.triton/autotune: No such file or directory
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543 trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
100%|██████████| 65/65 [00:28<00:00, 2.27it/s] 100%|██████████| 65/65 [00:28<00:00, 2.27it/s]
Evaluation on test dataset before finetuning: Evaluation on test dataset before finetuning:
{'eval_loss': 1.6385879516601562, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 31.0606, 'eval_samples_per_second': 16.677, 'eval_steps_per_second': 2.093} {'eval_loss': 1.6385879516601562, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 31.0606, 'eval_samples_per_second': 16.677, 'eval_steps_per_second': 2.093}
100%|██████████| 100/100 [02:32<00:00, 1.53s/it] 100%|██████████| 100/100 [02:32<00:00, 1.53s/it]
{'train_runtime': 152.9387, 'train_samples_per_second': 5.231, 'train_steps_per_second': 0.654, 'train_loss': 1.1880614471435547, 'epoch': 0.08} {'train_runtime': 152.9387, 'train_samples_per_second': 5.231, 'train_steps_per_second': 0.654, 'train_loss': 1.1880614471435547, 'epoch': 0.08}
Training result: Training result:
TrainOutput(global_step=100, training_loss=1.1880614471435547, metrics={'train_runtime': 152.9387, 'train_samples_per_second': 5.231, 'train_steps_per_second': 0.654, 'total_flos': 1.2286246663913472e+16, 'train_loss': 1.1880614471435547, 'epoch': 0.08123476848090982}) TrainOutput(global_step=100, training_loss=1.1880614471435547, metrics={'train_runtime': 152.9387, 'train_samples_per_second': 5.231, 'train_steps_per_second': 0.654, 'total_flos': 1.2286246663913472e+16, 'train_loss': 1.1880614471435547, 'epoch': 0.08123476848090982})
100%|██████████| 65/65 [00:28<00:00, 2.27it/s] 100%|██████████| 65/65 [00:28<00:00, 2.27it/s]
Evaluation on test dataset after finetuning: Evaluation on test dataset after finetuning:
{'eval_loss': 1.212457299232483, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 29.2657, 'eval_samples_per_second': 17.7, 'eval_steps_per_second': 2.221, 'epoch': 0.08123476848090982} {'eval_loss': 1.212457299232483, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 29.2657, 'eval_samples_per_second': 17.7, 'eval_steps_per_second': 2.221, 'epoch': 0.08123476848090982}
Memory occupied on GPUs: 16.9 GB. Memory occupied on GPUs: 16.9 GB.
real 4m5.340s real 4m5.340s
user 3m50.963s user 3m50.963s
sys 0m6.779s sys 0m6.779s
%% Cell type:markdown id:fb511eed-6add-4df7-b6f5-09e14a1e6b28 tags: %% Cell type:markdown id:fb511eed-6add-4df7-b6f5-09e14a1e6b28 tags:
#### Now, we will create an Accelerate config file and adapt the SLURM script so that Accelerate launches the program. We are still using only 1 GPU:
Note the times in the output again! Note the times in the output again!
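When more than one process is requested, `accelerate launch` (much like `torchrun`) starts the processes and exports the standard torch.distributed environment variables for each of them; with `distributed_type: NO` it simply runs the script in a single process. A small sketch for inspecting these variables from inside a launched script (the names below are the standard torch.distributed ones, not anything specific to this notebook):
``` python
# Print the distributed-launch environment variables, if the launcher has set them.
import os

for var in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT'):
    print(var, '=', os.environ.get(var, '<not set>'))
```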
%% Cell type:code id:447a57d4-7c26-4833-b310-88b6b356bf11 tags: %% Cell type:code id:447a57d4-7c26-4833-b310-88b6b356bf11 tags:
``` python ``` python
%%writefile accelerate_default_config_1gpu.yaml %%writefile accelerate_default_config_1gpu.yaml
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
debug: false debug: false
distributed_type: NO distributed_type: NO
mixed_precision: bf16 mixed_precision: bf16
downcast_bf16: 'yes' downcast_bf16: 'yes'
machine_rank: 0 machine_rank: 0
main_training_function: main main_training_function: main
num_machines: 1 num_machines: 1
num_processes: 1 num_processes: 1
rdzv_backend: static rdzv_backend: static
same_network: true same_network: true
use_cpu: false use_cpu: false
``` ```
%% Output %% Output
Writing ./tooling/config/accelerate_default_config_1gpu.yaml Writing ./tooling/config/accelerate_default_config_1gpu.yaml
%% Cell type:code id:367bc657-1aef-4ebc-8902-e6302c6759dc tags: %% Cell type:code id:367bc657-1aef-4ebc-8902-e6302c6759dc tags:
``` python ``` python
%%writefile run_phi3_guanaco_accelerate_1gpu.slurm %%writefile run_phi3_guanaco_accelerate_1gpu.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
#SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
# #SBATCH --reservation=s_tra_EUD20 #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1 #SBATCH --nodes=1
#SBATCH --gpus-per-task=1 # up to 4 on Leonardo #SBATCH --gpus-per-task=1 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo #SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Set environment variables for communication between nodes: # Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1) export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments: # Set launcher and launcher arguments:
export LAUNCHER="accelerate launch \ export LAUNCHER="accelerate launch \
--num_machines $SLURM_NNODES \ --num_machines $SLURM_NNODES \
--num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \ --num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \ --num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \ --main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \ --main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \ --machine_rank \$SLURM_PROCID \
--config_file \"accelerate_default_config_1gpu.yaml\" \ --config_file \"accelerate_default_config_1gpu.yaml\" \
" "
# Set training script that will be executed: # Set training script that will be executed:
export PROGRAM="phi3_guanaco_accelerate.py" export PROGRAM="phi3_guanaco_accelerate.py"
# Run: # Run:
time srun bash -c "$LAUNCHER $PROGRAM" time srun bash -c "$LAUNCHER $PROGRAM"
``` ```
%% Output %% Output
Overwriting run_phi3_guanaco_accelerate_1gpu.slurm Overwriting run_phi3_guanaco_accelerate_1gpu.slurm
%% Cell type:markdown id:9bc1d528-e9d5-4a25-a32a-3c2136237d79 tags: %% Cell type:markdown id:9bc1d528-e9d5-4a25-a32a-3c2136237d79 tags:
#### We can now submit the SLURM script and, once the job has finished, look at the output:
%% Cell type:code id:879b0935-a1be-4b40-835d-154a9ab1e537 tags: %% Cell type:code id:879b0935-a1be-4b40-835d-154a9ab1e537 tags:
``` python ``` python
!sbatch run_phi3_guanaco_accelerate_1gpu.slurm !sbatch run_phi3_guanaco_accelerate_1gpu.slurm
``` ```
%% Output %% Output
Submitted batch job 12961598 Submitted batch job 12961598
%% Cell type:code id:b2084719-fd83-49bf-a7d6-5674b8e9fed8 tags: %% Cell type:code id:b2084719-fd83-49bf-a7d6-5674b8e9fed8 tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12961598 boost_usr run_phi3 sharriso R 0:31 1 lrdn0151 12961598 boost_usr run_phi3 sharriso R 0:31 1 lrdn0151
12960270 boost_usr jupyter. sharriso R 2:50:54 1 lrdn0151 12960270 boost_usr jupyter. sharriso R 2:50:54 1 lrdn0151
%% Cell type:code id:2ee77873-5768-4543-b4a1-2207156cc423 tags: %% Cell type:code id:2ee77873-5768-4543-b4a1-2207156cc423 tags:
``` python ``` python
!cat slurm-12961598.out !cat slurm-12961598.out
``` ```
%% Output %% Output
+ date + date
Tue Feb 25 00:51:44 CET 2025 Tue Feb 25 00:51:44 CET 2025
+ hostname + hostname
lrdn0151.leonardo.local lrdn0151.leonardo.local
+ nvidia-smi + nvidia-smi
Tue Feb 25 00:51:44 2025 Tue Feb 25 00:51:44 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 |
| N/A 42C P0 63W / 479W| 0MiB / 65536MiB | 0% Default | | N/A 42C P0 63W / 479W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
++ shuf -i 20000-30000 -n 1 ++ shuf -i 20000-30000 -n 1
+ export MASTER_PORT=22153 + export MASTER_PORT=22153
+ MASTER_PORT=22153 + MASTER_PORT=22153
++ scontrol show hostnames lrdn0151 ++ scontrol show hostnames lrdn0151
++ head -n 1 ++ head -n 1
+ export MASTER_ADDR=lrdn0151 + export MASTER_ADDR=lrdn0151
+ MASTER_ADDR=lrdn0151 + MASTER_ADDR=lrdn0151
+ export OMP_NUM_THREADS=8 + export OMP_NUM_THREADS=8
+ OMP_NUM_THREADS=8 + OMP_NUM_THREADS=8
+ export 'LAUNCHER=accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" ' + export 'LAUNCHER=accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" '
+ LAUNCHER='accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" ' + LAUNCHER='accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" '
+ export PROGRAM=phi3_guanaco_accelerate.py + export PROGRAM=phi3_guanaco_accelerate.py
+ PROGRAM=phi3_guanaco_accelerate.py + PROGRAM=phi3_guanaco_accelerate.py
+ srun bash -c 'accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" phi3_guanaco_accelerate.py' + srun bash -c 'accelerate launch --num_machines 1 --num_processes 1 --num_cpu_threads_per_process 8 --main_process_ip lrdn0151 --main_process_port 22153 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config_1gpu.yaml" phi3_guanaco_accelerate.py'
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Running on device: cuda Running on device: cuda
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.45s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.45s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 518/518 [00:00<00:00, 2380.10 examples/s] Map: 100%|██████████| 518/518 [00:00<00:00, 2380.10 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-25 00:52:22,253] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 00:52:22,253] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543 trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
100%|██████████| 65/65 [00:28<00:00, 2.25it/s] 100%|██████████| 65/65 [00:28<00:00, 2.25it/s]
Evaluation on test dataset before finetuning: Evaluation on test dataset before finetuning:
{'eval_loss': 1.6385879516601562, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 31.3092, 'eval_samples_per_second': 16.545, 'eval_steps_per_second': 2.076} {'eval_loss': 1.6385879516601562, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 31.3092, 'eval_samples_per_second': 16.545, 'eval_steps_per_second': 2.076}
100%|██████████| 100/100 [02:41<00:00, 1.62s/it] 100%|██████████| 100/100 [02:41<00:00, 1.62s/it]
{'train_runtime': 161.9574, 'train_samples_per_second': 4.94, 'train_steps_per_second': 0.617, 'train_loss': 1.1879594421386719, 'epoch': 0.08} {'train_runtime': 161.9574, 'train_samples_per_second': 4.94, 'train_steps_per_second': 0.617, 'train_loss': 1.1879594421386719, 'epoch': 0.08}
Training result: Training result:
TrainOutput(global_step=100, training_loss=1.1879594421386719, metrics={'train_runtime': 161.9574, 'train_samples_per_second': 4.94, 'train_steps_per_second': 0.617, 'total_flos': 1.2286246663913472e+16, 'train_loss': 1.1879594421386719, 'epoch': 0.08123476848090982}) TrainOutput(global_step=100, training_loss=1.1879594421386719, metrics={'train_runtime': 161.9574, 'train_samples_per_second': 4.94, 'train_steps_per_second': 0.617, 'total_flos': 1.2286246663913472e+16, 'train_loss': 1.1879594421386719, 'epoch': 0.08123476848090982})
100%|██████████| 65/65 [00:28<00:00, 2.26it/s] 100%|██████████| 65/65 [00:28<00:00, 2.26it/s]
Evaluation on test dataset after finetuning: Evaluation on test dataset after finetuning:
{'eval_loss': 1.2126834392547607, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 29.4553, 'eval_samples_per_second': 17.586, 'eval_steps_per_second': 2.207, 'epoch': 0.08123476848090982} {'eval_loss': 1.2126834392547607, 'eval_model_preparation_time': 0.0038, 'eval_runtime': 29.4553, 'eval_samples_per_second': 17.586, 'eval_steps_per_second': 2.207, 'epoch': 0.08123476848090982}
Memory occupied on GPUs: 16.9 GB. Memory occupied on GPUs: 16.9 GB.
real 4m23.740s real 4m23.740s
user 0m0.267s user 0m0.267s
sys 0m0.011s sys 0m0.011s
%% Cell type:markdown id:e4f1e520-75c5-4c47-bedc-e586366d0fc8 tags: %% Cell type:markdown id:e4f1e520-75c5-4c47-bedc-e586366d0fc8 tags:
#### Now, we write another config file and SLURM script to train on multiple GPUs across two nodes with Accelerate, and submit the script to the scheduler again:
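With DDP, every process consumes its own micro-batch, so the global batch size per optimizer step is `per_device_train_batch_size * gradient_accumulation_steps * num_processes`. A back-of-the-envelope sketch; the two batch-size values below are the transformers defaults, stated here as an assumption because the training script does not set them explicitly:
``` python
# Rough estimate of the global batch size per optimizer step under DDP.
per_device_train_batch_size = 8   # transformers default (assumption)
gradient_accumulation_steps = 1   # transformers default (assumption)
num_processes = 4                 # 2 nodes x 2 GPUs per node in the SLURM script below
global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_processes
print(global_batch_size)          # 32
```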
%% Cell type:code id:31027442-ae73-44ee-95f5-e1d0f0214e4a tags: %% Cell type:code id:31027442-ae73-44ee-95f5-e1d0f0214e4a tags:
``` python ``` python
%%writefile accelerate_default_config.yaml %%writefile accelerate_default_config.yaml
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
debug: false debug: false
distributed_type: MULTI_GPU distributed_type: MULTI_GPU
mixed_precision: bf16 mixed_precision: bf16
downcast_bf16: 'yes' downcast_bf16: 'yes'
machine_rank: 0 machine_rank: 0
main_training_function: main main_training_function: main
num_machines: 2 num_machines: 2
num_processes: 4 num_processes: 4
rdzv_backend: static rdzv_backend: static
same_network: true same_network: true
use_cpu: false use_cpu: false
``` ```
%% Output %% Output
Overwriting ./tooling/config/accelerate_default_config.yaml Overwriting ./tooling/config/accelerate_default_config.yaml
%% Cell type:code id:03807fe7-0a2c-470c-849c-7e0b887f06f1 tags: %% Cell type:code id:03807fe7-0a2c-470c-849c-7e0b887f06f1 tags:
``` python ``` python
%%writefile run_phi3_guanaco_accelerate_multigpu.slurm %%writefile run_phi3_guanaco_accelerate_multigpu.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
#SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
# #SBATCH --reservation=s_tra_EUD20 #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=2 #SBATCH --nodes=2
#SBATCH --gpus-per-task=2 # up to 4 on Leonardo #SBATCH --gpus-per-task=2 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=240GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Set environment variables for communication between nodes: # Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1) export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments: # Set launcher and launcher arguments:
export LAUNCHER="accelerate launch \ export LAUNCHER="accelerate launch \
--num_machines $SLURM_NNODES \ --num_machines $SLURM_NNODES \
--num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \ --num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \ --num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \ --main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \ --main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \ --machine_rank \$SLURM_PROCID \
--config_file \"accelerate_default_config.yaml\" \ --config_file \"accelerate_default_config.yaml\" \
" "
# Set training script that will be executed: # Set training script that will be executed:
export PROGRAM="phi3_guanaco_accelerate.py" export PROGRAM="phi3_guanaco_accelerate.py"
# Run: # Run:
time srun bash -c "$LAUNCHER $PROGRAM" time srun bash -c "$LAUNCHER $PROGRAM"
``` ```
%% Output %% Output
Writing run_phi3_guanaco_accelerate_multigpu.slurm Writing run_phi3_guanaco_accelerate_multigpu.slurm
%% Cell type:code id:b7b77016-c158-4f30-b724-ba641947bb3e tags: %% Cell type:code id:b7b77016-c158-4f30-b724-ba641947bb3e tags:
``` python ``` python
!sbatch run_phi3_guanaco_accelerate_multigpu.slurm !sbatch run_phi3_guanaco_accelerate_multigpu.slurm
``` ```
%% Output %% Output
Submitted batch job 12961611 Submitted batch job 12961611
%% Cell type:code id:0bf67e19-dc5b-4281-aa21-f4ac41dec56b tags: %% Cell type:code id:0bf67e19-dc5b-4281-aa21-f4ac41dec56b tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12961611 boost_usr run_phi3 sharriso R 0:06 2 lrdn[1065-1066] 12961611 boost_usr run_phi3 sharriso R 0:06 2 lrdn[1065-1066]
12960270 boost_usr jupyter. sharriso R 3:07:15 1 lrdn0151 12960270 boost_usr jupyter. sharriso R 3:07:15 1 lrdn0151
%% Cell type:code id:52797ca8-b066-44f3-8e0f-a57f6b8a5504 tags: %% Cell type:code id:52797ca8-b066-44f3-8e0f-a57f6b8a5504 tags:
``` python ``` python
!cat slurm-12961611.out !cat slurm-12961611.out
``` ```
%% Output %% Output
+ date + date
Tue Feb 25 01:08:31 CET 2025 Tue Feb 25 01:08:31 CET 2025
+ hostname + hostname
lrdn1065.leonardo.local lrdn1065.leonardo.local
+ nvidia-smi + nvidia-smi
Tue Feb 25 01:08:31 2025 Tue Feb 25 01:08:31 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 |
| N/A 42C P0 64W / 485W| 0MiB / 65536MiB | 0% Default | | N/A 42C P0 64W / 485W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 | | 1 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 |
| N/A 43C P0 63W / 469W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 63W / 469W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
++ shuf -i 20000-30000 -n 1 ++ shuf -i 20000-30000 -n 1
+ export MASTER_PORT=27595 + export MASTER_PORT=27595
+ MASTER_PORT=27595 + MASTER_PORT=27595
++ head -n 1 ++ head -n 1
++ scontrol show hostnames 'lrdn[1065-1066]' ++ scontrol show hostnames 'lrdn[1065-1066]'
+ export MASTER_ADDR=lrdn1065 + export MASTER_ADDR=lrdn1065
+ MASTER_ADDR=lrdn1065 + MASTER_ADDR=lrdn1065
+ export OMP_NUM_THREADS=16 + export OMP_NUM_THREADS=16
+ OMP_NUM_THREADS=16 + OMP_NUM_THREADS=16
+ export 'LAUNCHER=accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" ' + export 'LAUNCHER=accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" '
+ LAUNCHER='accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" ' + LAUNCHER='accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" '
+ export PROGRAM=phi3_guanaco_accelerate.py + export PROGRAM=phi3_guanaco_accelerate.py
+ PROGRAM=phi3_guanaco_accelerate.py + PROGRAM=phi3_guanaco_accelerate.py
+ srun bash -c 'accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" phi3_guanaco_accelerate.py' + srun bash -c 'accelerate launch --num_machines 2 --num_processes 4 --num_cpu_threads_per_process 8 --main_process_ip lrdn1065 --main_process_port 27595 --machine_rank $SLURM_PROCID --config_file "./tooling/config/accelerate_default_config.yaml" phi3_guanaco_accelerate.py'
Running on device: cuda:0 Running on device: cuda:0
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
`low_cpu_mem_usage` was None, now default to True since model is quantized. `low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.41s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.41s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 2.00s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 2.00s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.77s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.77s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.33s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00, 2.33s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
[rank1]:[W225 01:09:06.024495300 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank1]:[W225 01:09:06.024495300 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
[rank0]:[W225 01:09:06.167706885 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank0]:[W225 01:09:06.167706885 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[rank3]:[W225 01:09:06.088886393 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank3]:[W225 01:09:06.088886393 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[rank2]:[W225 01:09:06.185924046 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank2]:[W225 01:09:06.185924046 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-25 01:09:07,129] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 01:09:07,129] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-25 01:09:07,129] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 01:09:07,129] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-25 01:09:07,152] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 01:09:07,152] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-25 01:09:07,152] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-25 01:09:07,152] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543 trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
Evaluation on test dataset before finetuning: Evaluation on test dataset before finetuning:
{'eval_loss': 1.630933165550232, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 12.4764, 'eval_samples_per_second': 41.518, 'eval_steps_per_second': 1.363} {'eval_loss': 1.630933165550232, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 12.4764, 'eval_samples_per_second': 41.518, 'eval_steps_per_second': 1.363}
100%|██████████| 17/17 [00:10<00:00, 1.64it/s] 100%|██████████| 17/17 [00:10<00:00, 1.64it/s]
87%|████████▋ | 87/100 [00:45<00:06, 2.01it/s]
{'train_runtime': 52.0472, 'train_samples_per_second': 15.371, 'train_steps_per_second': 1.921, 'train_loss': 1.2388738250732423, 'epoch': 0.08}
100%|██████████| 100/100 [00:51<00:00, 1.92it/s] 100%|██████████| 100/100 [00:51<00:00, 1.92it/s]
Training result: Training result:
TrainOutput(global_step=100, training_loss=1.2388738250732423, metrics={'train_runtime': 52.0472, 'train_samples_per_second': 15.371, 'train_steps_per_second': 1.921, 'total_flos': 7775281301422080.0, 'train_loss': 1.2388738250732423, 'epoch': 0.08123476848090982}) TrainOutput(global_step=100, training_loss=1.2388738250732423, metrics={'train_runtime': 52.0472, 'train_samples_per_second': 15.371, 'train_steps_per_second': 1.921, 'total_flos': 7775281301422080.0, 'train_loss': 1.2388738250732423, 'epoch': 0.08123476848090982})
100%|██████████| 17/17 [00:10<00:00, 1.67it/s] 100%|██████████| 17/17 [00:10<00:00, 1.67it/s]
Evaluation on test dataset after finetuning: Evaluation on test dataset after finetuning:
{'eval_loss': 1.2084568738937378, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 10.8504, 'eval_samples_per_second': 47.74, 'eval_steps_per_second': 1.567, 'epoch': 0.08123476848090982} {'eval_loss': 1.2084568738937378, 'eval_model_preparation_time': 0.0037, 'eval_runtime': 10.8504, 'eval_samples_per_second': 47.74, 'eval_steps_per_second': 1.567, 'epoch': 0.08123476848090982}
Memory occupied on GPUs: 18.1 + 17.0 GB. Memory occupied on GPUs: 18.1 + 17.0 GB.
Memory occupied on GPUs: 11.3 + 14.1 GB. Memory occupied on GPUs: 11.3 + 14.1 GB.
real 1m54.808s real 1m54.808s
user 0m0.265s user 0m0.265s
sys 0m0.010s sys 0m0.010s
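%% Cell type:markdown tags:
Comparing the three runs: the plain `python3` run and the single-GPU Accelerate run take roughly the same time (about 4 minutes of wall time, with a train_runtime of ~153 s and ~162 s respectively), while the run on 2 nodes with 4 GPUs finishes the same 100 optimizer steps in about 52 s of train_runtime and under 2 minutes of wall time, reaching a comparable evaluation loss after finetuning.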
%% Cell type:markdown id:8bbfef9e-0d43-41d0-ac18-954d73e7075c tags: %% Cell type:markdown id:8bbfef9e-0d43-41d0-ac18-954d73e7075c tags:
#### Before we close the notebook, we should clean up the files created: #### Before we close the notebook, we should clean up the files created:
%% Cell type:code id:aab5d70f-70a6-4ee0-a080-c51bd0358dbb tags: %% Cell type:code id:aab5d70f-70a6-4ee0-a080-c51bd0358dbb tags:
``` python ``` python
!rm phi3_guanaco_accelerate.py run_phi3_guanaco_accelerate_1gpu.slurm run_phi3_guanaco_accelerate_multigpu.slurm slurm-*.out *.yaml !rm phi3_guanaco_accelerate.py run_phi3_guanaco_accelerate_1gpu.slurm run_phi3_guanaco_accelerate_multigpu.slurm slurm-*.out *.yaml
``` ```
%% Output %% Output
rm: cannot remove '*.yaml': No such file or directory rm: cannot remove '*.yaml': No such file or directory
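%% Cell type:markdown tags:
The `*.yaml` pattern is not matched because, according to the `%%writefile` outputs above, the config files were written to `./tooling/config/`; if that is where they ended up on your system, something like the following removes them as well:
``` python
!rm ./tooling/config/accelerate_default_config_1gpu.yaml ./tooling/config/accelerate_default_config.yaml
```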
%% Cell type:code id:c9452e84-1d1d-4669-a688-9cdaf25daf6a tags: %% Cell type:code id:c9452e84-1d1d-4669-a688-9cdaf25daf6a tags:
``` python ``` python
``` ```
......
%% Cell type:markdown id:fa76d21e-5f67-4405-8de3-048673db1b71 tags: %% Cell type:markdown id:fa76d21e-5f67-4405-8de3-048673db1b71 tags:
# ZeRO with Deepspeed # ZeRO with Deepspeed
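DeepSpeed's ZeRO (Zero Redundancy Optimizer) reduces the per-GPU memory footprint by partitioning training state across the data-parallel processes instead of replicating it: stage 1 shards the optimizer states, stage 2 additionally shards the gradients, and stage 3 also shards the model parameters. With the Accelerate integration used below, the ZeRO stage and offloading options are chosen in the Accelerate/DeepSpeed configuration rather than in the training code.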
%% Cell type:code id:0e7333b1-a904-433c-a75a-d5966568adba tags: %% Cell type:code id:0e7333b1-a904-433c-a75a-d5966568adba tags:
``` python ``` python
%%writefile phi3_guanaco_accelerate_deepspeed.py %%writefile phi3_guanaco_accelerate_deepspeed.py
import torch import torch
from accelerate import Accelerator from accelerate import Accelerator
from datasets import load_dataset from datasets import load_dataset
from peft import LoraConfig, get_peft_model from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig from trl import SFTTrainer, SFTConfig
import pynvml import pynvml
import deepspeed import deepspeed
def print_gpu_utilization(): def print_gpu_utilization():
pynvml.nvmlInit() pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount() device_count = pynvml.nvmlDeviceGetCount()
memory_used = [] memory_used = []
for device_index in range(device_count): for device_index in range(device_count):
device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index) device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle) device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
memory_used.append(device_info.used / 1024**3) memory_used.append(device_info.used / 1024**3)
print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.') print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')
def main(): def main():
# Initialize Accelerator; its configuration (including DeepSpeed) is loaded from the config file. # Initialize Accelerator; its configuration (including DeepSpeed) is loaded from the config file.
accelerator = Accelerator() accelerator = Accelerator()
device = accelerator.device device = accelerator.device
if accelerator.is_main_process: if accelerator.is_main_process:
print(f"Running on device: {device}") print(f"Running on device: {device}")
# Define model name and load tokenizer. # Define model name and load tokenizer.
model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct' model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right' tokenizer.padding_side = 'right'
# Load the model with 4-bit quantization. # Load the model with 4-bit quantization.
# Note: We no longer specify a manual device map because DeepSpeed (via Accelerate) will handle device placement.
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
quantization_config=BitsAndBytesConfig( quantization_config=BitsAndBytesConfig(
load_in_4bit=True, load_in_4bit=True,
bnb_4bit_quant_type='nf4', bnb_4bit_quant_type='nf4',
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_storage=torch.bfloat16, bnb_4bit_quant_storage=torch.bfloat16,
), ),
attn_implementation='eager', attn_implementation='eager',
trust_remote_code=True, trust_remote_code=True,
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
) )
# For DeepSpeed integration, you can remove or comment out the following: # For DeepSpeed integration, you can remove or comment out the following:
# model.to(device) # model.to(device)
# Disable caching (KV cache is only useful during inference). # Disable caching (KV cache is only useful during inference).
#model.config.use_cache = False #model.config.use_cache = False
# Add LoRA adapters. # Add LoRA adapters.
peft_config = LoraConfig( peft_config = LoraConfig(
task_type='CAUSAL_LM', task_type='CAUSAL_LM',
r=16, r=16,
lora_alpha=32, # rule of thumb: lora_alpha should be about 2 * r lora_alpha=32, # rule of thumb: lora_alpha should be about 2 * r
lora_dropout=0.05, lora_dropout=0.05,
bias='none', bias='none',
target_modules='all-linear', target_modules='all-linear',
) )
model = get_peft_model(model, peft_config) model = get_peft_model(model, peft_config)
# Load and preprocess the dataset. # Load and preprocess the dataset.
guanaco_train = load_dataset( guanaco_train = load_dataset(
'/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco',
split='train' split='train'
) )
guanaco_test = load_dataset( guanaco_test = load_dataset(
'/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco',
split='test' split='test'
) )
# Process each example to extract the user prompt and assistant response. # Process each example to extract the user prompt and assistant response.
guanaco_train = guanaco_train.map(lambda entry: { guanaco_train = guanaco_train.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
guanaco_test = guanaco_test.map(lambda entry: { guanaco_test = guanaco_test.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
# Restructure to a chat format expected by our formatting function. # Restructure to a chat format expected by our formatting function.
guanaco_train = guanaco_train.map(lambda entry: {'messages': [ guanaco_train = guanaco_train.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
guanaco_test = guanaco_test.map(lambda entry: {'messages': [ guanaco_test = guanaco_test.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
# Define training arguments. # Define training arguments.
    # DeepSpeed itself is configured through the Accelerate config file, so no extra `deepspeed` argument is needed here.
training_arguments = SFTConfig( training_arguments = SFTConfig(
output_dir='output/phi-3.5-mini-instruct-guanaco-deepspeed', output_dir='output/phi-3.5-mini-instruct-guanaco-deepspeed',
#per_device_train_batch_size=8, #per_device_train_batch_size=8,
#gradient_accumulation_steps=1, #gradient_accumulation_steps=1,
gradient_checkpointing=True, gradient_checkpointing=True,
#gradient_checkpointing_kwargs={'use_reentrant': False}, #gradient_checkpointing_kwargs={'use_reentrant': False},
optim='adamw_torch', optim='adamw_torch',
learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B
logging_strategy='no', logging_strategy='no',
save_strategy='no', save_strategy='no',
max_steps=100, max_steps=100,
bf16=True, bf16=True,
report_to='none', report_to='none',
max_seq_length=1024, max_seq_length=1024,
) )
def formatting_func(entry): def formatting_func(entry):
return tokenizer.apply_chat_template(entry['messages'], tokenize=False) return tokenizer.apply_chat_template(entry['messages'], tokenize=False)
# Create the SFTTrainer. # Create the SFTTrainer.
trainer = SFTTrainer( trainer = SFTTrainer(
model=model, model=model,
args=training_arguments, args=training_arguments,
train_dataset=guanaco_train, train_dataset=guanaco_train,
eval_dataset=guanaco_test, eval_dataset=guanaco_test,
processing_class=tokenizer, processing_class=tokenizer,
formatting_func=formatting_func, formatting_func=formatting_func,
) )
# Optionally print trainable parameters on the main process only. # Optionally print trainable parameters on the main process only.
if accelerator.is_main_process and hasattr(trainer.model, "print_trainable_parameters"): if accelerator.is_main_process and hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters() trainer.model.print_trainable_parameters()
# Evaluate before training. # Evaluate before training.
eval_result = trainer.evaluate() eval_result = trainer.evaluate()
if accelerator.is_main_process: if accelerator.is_main_process:
print("Evaluation on test dataset before finetuning:") print("Evaluation on test dataset before finetuning:")
print(eval_result) print(eval_result)
# Train the model. # Train the model.
train_result = trainer.train() train_result = trainer.train()
if accelerator.is_main_process: if accelerator.is_main_process:
print("Training result:") print("Training result:")
print(train_result) print(train_result)
# Evaluate after training. # Evaluate after training.
eval_result = trainer.evaluate() eval_result = trainer.evaluate()
if accelerator.is_main_process: if accelerator.is_main_process:
print("Evaluation on test dataset after finetuning:") print("Evaluation on test dataset after finetuning:")
print(eval_result) print(eval_result)
# Print GPU memory usage (only once per node). # Print GPU memory usage (only once per node).
if accelerator.local_process_index == 0: if accelerator.local_process_index == 0:
print_gpu_utilization() print_gpu_utilization()
if __name__ == "__main__": if __name__ == "__main__":
main() main()
``` ```
%% Output %% Output
Overwriting phi3_guanaco_accelerate_deepspeed.py Overwriting phi3_guanaco_accelerate_deepspeed.py
%% Cell type:code id:37973d74-77c0-48a5-bf5b-d66b816d18b4 tags: %% Cell type:code id:37973d74-77c0-48a5-bf5b-d66b816d18b4 tags:
``` python ``` python
# Inline DeepSpeed configuration meant to roughly correspond to the FSDP settings used later in these examples:
# - Use bf16 for mixed precision, similar to mixed_precision: bf16 in the FSDP config.
# - Optimizer settings can be adjusted as needed, or removed if not required.
# - Use ZeRO optimization stage 3 to mimic FSDP's full sharding.
# - Do not offload parameters (fsdp_offload_params: false).
# These options are chosen to be simple; they differ from FSDP's wrapping and prefetching policies.
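# --- Sketch (added for illustration, not part of the original notebook) ---
# The comments above describe settings that would normally live in a DeepSpeed JSON
# config. A minimal dict with those settings could look like the following; the key
# names follow the public DeepSpeed documentation, "auto" lets the Hugging Face
# Trainer fill in values from its own arguments, and the file name below is only a
# placeholder chosen to match the clean-up pattern at the end of this notebook.
import json

ds_config = {
    'bf16': {'enabled': True},                 # mixed precision, like mixed_precision: bf16
    'zero_optimization': {
        'stage': 3,                            # shard parameters, gradients and optimizer state
        'offload_param': {'device': 'none'},   # keep parameters on the GPU (no CPU offload)
    },
    'gradient_accumulation_steps': 'auto',
    'train_micro_batch_size_per_gpu': 'auto',
}

with open('deepspeed_config_sketch.json', 'w') as f:
    json.dump(ds_config, f, indent=2)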
``` ```
%% Cell type:code id:08d90978-c5a0-4bc3-840c-d0c272474868 tags: %% Cell type:code id:08d90978-c5a0-4bc3-840c-d0c272474868 tags:
``` python ``` python
%%writefile accelerate_deepspeed_config.yaml %%writefile accelerate_deepspeed_config.yaml
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
debug: false debug: false
distributed_type: DEEPSPEED distributed_type: DEEPSPEED
downcast_bf16: 'no' downcast_bf16: 'no'
deepspeed_config: deepspeed_config:
bf16: true bf16: true
zero_stage: 3 zero_stage: 3
machine_rank: 0 machine_rank: 0
main_training_function: main main_training_function: main
mixed_precision: bf16 mixed_precision: bf16
num_machines: 2 num_machines: 2
num_processes: 4 num_processes: 4
rdzv_backend: c10d rdzv_backend: c10d
same_network: true same_network: true
use_cpu: false use_cpu: false
``` ```
%% Output %% Output
Overwriting accelerate_deepspeed_config.yaml Overwriting accelerate_deepspeed_config.yaml
%% Cell type:code id:f9cdaa92-0a89-46fc-9f99-fed07f005d60 tags: %% Cell type:code id:f9cdaa92-0a89-46fc-9f99-fed07f005d60 tags:
``` python ``` python
%%writefile run_phi3_guanaco_accelerate_deepspeed.slurm %%writefile run_phi3_guanaco_accelerate_deepspeed.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_EUD20b #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=2 #SBATCH --nodes=2
#SBATCH --gpus-per-task=2 # up to 4 on Leonardo #SBATCH --gpus-per-task=2 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=240GB               # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Set environment variables for communication between nodes: # Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1) export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments: # Set launcher and launcher arguments:
export LAUNCHER="accelerate launch \ export LAUNCHER="accelerate launch \
--num_machines $SLURM_NNODES \ --num_machines $SLURM_NNODES \
    --num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \ --num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \ --main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \ --main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \ --machine_rank \$SLURM_PROCID \
--config_file \"./accelerate_deepspeed_config.yaml\" \ --config_file \"./accelerate_deepspeed_config.yaml\" \
" "
# Set training script that will be executed: # Set training script that will be executed:
export PROGRAM="phi3_guanaco_accelerate_deepspeed.py" export PROGRAM="phi3_guanaco_accelerate_deepspeed.py"
# Run: # Run:
time srun bash -c "$LAUNCHER $PROGRAM" time srun bash -c "$LAUNCHER $PROGRAM"
``` ```
%% Output %% Output
Overwriting run_phi3_guanaco_accelerate_deepspeed.slurm Overwriting run_phi3_guanaco_accelerate_deepspeed.slurm
%% Cell type:markdown id:2b5b1e49-ddc0-4af9-879b-4ec276215b61 tags: %% Cell type:markdown id:2b5b1e49-ddc0-4af9-879b-4ec276215b61 tags:
#### We can now execute the SLURM script and, once the job has run, look at the output:
%% Cell type:code id:d57e8f1c-65da-4475-8025-2323f512e6cd tags: %% Cell type:code id:d57e8f1c-65da-4475-8025-2323f512e6cd tags:
``` python ``` python
!sbatch run_phi3_guanaco_accelerate_deepspeed.slurm !sbatch run_phi3_guanaco_accelerate_deepspeed.slurm
``` ```
%% Output %% Output
Submitted batch job 12962050 Submitted batch job 12962050
%% Cell type:code id:09922169-bea3-4439-8cd5-bfb84d0f17bc tags: %% Cell type:code id:09922169-bea3-4439-8cd5-bfb84d0f17bc tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12962050 boost_usr run_phi3 sharriso R 0:58 2 lrdn[0121-0122] 12962050 boost_usr run_phi3 sharriso R 0:58 2 lrdn[0121-0122]
12961675 boost_usr jupyter. sharriso R 2:32:37 1 lrdn0135 12961675 boost_usr jupyter. sharriso R 2:32:37 1 lrdn0135
%% Cell type:code id:e0292ef6-43d2-4a7c-a391-02f9f431dcaa tags: %% Cell type:code id:e0292ef6-43d2-4a7c-a391-02f9f431dcaa tags:
``` python ``` python
!cat slurm-.out !cat slurm-.out
``` ```
%% Cell type:markdown id:a697b550-e30f-471c-879d-8376825f0f5a tags: %% Cell type:markdown id:a697b550-e30f-471c-879d-8376825f0f5a tags:
#### Before we close the notebook, we should clean up the files created: #### Before we close the notebook, we should clean up the files created:
%% Cell type:code id:f03d5607-4f18-4579-9488-bc202107671c tags: %% Cell type:code id:f03d5607-4f18-4579-9488-bc202107671c tags:
``` python ``` python
!rm phi3_guanaco_accelerate_deepspeed.py run_phi3_guanaco_accelerate_deepspeed.slurm slurm-*.out accelerate_deepspeed_config*.yaml deepspeed_config*.json !rm phi3_guanaco_accelerate_deepspeed.py run_phi3_guanaco_accelerate_deepspeed.slurm slurm-*.out accelerate_deepspeed_config*.yaml deepspeed_config*.json
``` ```
%% Output %% Output
rm: cannot remove '*.yaml': No such file or directory rm: cannot remove '*.yaml': No such file or directory
......
%% Cell type:markdown id:d0b1f163-108f-4587-8b0d-0246158ee528 tags: %% Cell type:markdown id:d0b1f163-108f-4587-8b0d-0246158ee528 tags:
# FSDP example with Phi-3.5 mini instruct and openassistant-guanaco dataset # FSDP example with Phi-3.5 mini instruct and openassistant-guanaco dataset
In this example, a network is trained on multiple GPUs with the help of FSDP (Fully Sharded Data Parallel). This approach makes it possible to train networks that are too large to fit into the memory of a single GPU.
If we want to use multiple GPUs, we need to write the code to a file and submit the job to the SLURM scheduler, because the JupyterHub that we are using today does not have access to any GPU. This example uses two GPUs on one node, but could be extended simply by adjusting the number of GPUs and nodes in the SLURM script. If we want to use multiple GPUs, we need to write the code to a file and submit the job to the SLURM scheduler, because the JupyterHub that we are using today does not have access to any GPU. This example uses two GPUs on one node, but could be extended simply by adjusting the number of GPUs and nodes in the SLURM script.
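Before looking at the full training script, the following minimal sketch (added for illustration and not part of the original example) shows the basic FSDP idea in plain PyTorch: parameters, gradients and optimizer state are sharded across the participating GPUs, and full parameters are gathered only where they are needed. The toy model, sizes and file name are made up; it would be launched with something like `torchrun --nproc_per_node=2 fsdp_sketch.py`.

``` python
# Minimal FSDP sketch (illustration only, not the training script used in this notebook).
import os
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    dist.init_process_group(backend='nccl')   # torchrun sets RANK, WORLD_SIZE, MASTER_ADDR, ...
    local_rank = int(os.environ['LOCAL_RANK'])
    torch.cuda.set_device(local_rank)

    # A toy model standing in for one that would be too large for a single GPU:
    model = torch.nn.Sequential(
        torch.nn.Linear(1024, 4096),
        torch.nn.ReLU(),
        torch.nn.Linear(4096, 1024),
    ).cuda()

    # FSDP shards parameters, gradients and optimizer state across all ranks and
    # gathers full parameters only for the part of the model currently being computed:
    model = FSDP(model)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    for _ in range(10):
        batch = torch.randn(8, 1024, device='cuda')
        loss = model(batch).pow(2).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    dist.destroy_process_group()

if __name__ == '__main__':
    main()
```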
%% Cell type:markdown id:d7d9ee84-e29b-4c05-b124-50e735033760 tags: %% Cell type:markdown id:d7d9ee84-e29b-4c05-b124-50e735033760 tags:
#### First, we write the python code to a file: #### First, we write the python code to a file:
%% Cell type:code id:565c4533-5104-4a7c-a688-8b6acb72e17d tags: %% Cell type:code id:565c4533-5104-4a7c-a688-8b6acb72e17d tags:
``` python ``` python
%%writefile phi3_guanaco_fsdp.py %%writefile phi3_guanaco_fsdp.py
# Import libraries # Import libraries
import torch import torch
from accelerate import PartialState from accelerate import PartialState
from datasets import load_dataset from datasets import load_dataset
from peft import LoraConfig, get_peft_model from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig from trl import SFTTrainer, SFTConfig
import pynvml import pynvml
def print_gpu_utilization(): def print_gpu_utilization():
pynvml.nvmlInit() pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount() device_count = pynvml.nvmlDeviceGetCount()
memory_used = [] memory_used = []
for device_index in range(device_count): for device_index in range(device_count):
device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index) device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle) device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
memory_used.append(device_info.used/1024**3) memory_used.append(device_info.used/1024**3)
print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.') print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')
# Choose a model and load tokenizer and model (using 4bit quantization): # Choose a model and load tokenizer and model (using 4bit quantization):
model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct' model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
# model_name = 'microsoft/Phi-3.5-mini-instruct' # model_name = 'microsoft/Phi-3.5-mini-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right' tokenizer.padding_side = 'right'
# For multi-GPU training, find out how many GPUs there are and which one we should use: # For multi-GPU training, find out how many GPUs there are and which one we should use:
ps = PartialState() ps = PartialState()
num_processes = ps.num_processes num_processes = ps.num_processes
process_index = ps.process_index process_index = ps.process_index
local_process_index = ps.local_process_index local_process_index = ps.local_process_index
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
quantization_config=BitsAndBytesConfig( quantization_config=BitsAndBytesConfig(
load_in_4bit=True, load_in_4bit=True,
bnb_4bit_quant_type='nf4', bnb_4bit_quant_type='nf4',
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_storage=torch.bfloat16, # Added for FSDP bnb_4bit_quant_storage=torch.bfloat16, # Added for FSDP
), ),
# device_map={'':local_process_index}, # Removed for FSDP # device_map={'':local_process_index}, # Removed for FSDP
attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2" attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2"
trust_remote_code=True, trust_remote_code=True,
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
) )
# Load the guanaco dataset # Load the guanaco dataset
guanaco_train = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='train') guanaco_train = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='train')
guanaco_test = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='test') guanaco_test = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='test')
# guanaco_train = load_dataset('timdettmers/openassistant-guanaco', split='train') # guanaco_train = load_dataset('timdettmers/openassistant-guanaco', split='train')
# guanaco_test = load_dataset('timdettmers/openassistant-guanaco', split='test') # guanaco_test = load_dataset('timdettmers/openassistant-guanaco', split='test')
guanaco_train = guanaco_train.map(lambda entry: { guanaco_train = guanaco_train.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
guanaco_test = guanaco_test.map(lambda entry: { guanaco_test = guanaco_test.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
guanaco_train = guanaco_train.map(lambda entry: {'messages': [ guanaco_train = guanaco_train.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
guanaco_test = guanaco_test.map(lambda entry: {'messages': [ guanaco_test = guanaco_test.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
model.config.use_cache = False # KV cache can only speed up inference, but we are doing training. model.config.use_cache = False # KV cache can only speed up inference, but we are doing training.
# Add low-rank adapters (LORA) to the model: # Add low-rank adapters (LORA) to the model:
peft_config = LoraConfig( peft_config = LoraConfig(
task_type='CAUSAL_LM', task_type='CAUSAL_LM',
r=16, r=16,
    lora_alpha=32, # rule of thumb: lora_alpha should be about 2 * r
lora_dropout=0.05, lora_dropout=0.05,
bias='none', bias='none',
target_modules='all-linear', target_modules='all-linear',
) )
model = get_peft_model(model, peft_config) model = get_peft_model(model, peft_config)
training_arguments = SFTConfig( training_arguments = SFTConfig(
    output_dir='output/phi-3.5-mini-instruct-guanaco-fsdp',
per_device_train_batch_size=8, per_device_train_batch_size=8,
gradient_accumulation_steps=1, gradient_accumulation_steps=1,
gradient_checkpointing=True, # Gradient checkpointing improves memory efficiency, but slows down training, gradient_checkpointing=True, # Gradient checkpointing improves memory efficiency, but slows down training,
# e.g. Mistral 7B with PEFT using bitsandbytes: # e.g. Mistral 7B with PEFT using bitsandbytes:
# - enabled: 11 GB GPU RAM and 8 samples/second # - enabled: 11 GB GPU RAM and 8 samples/second
# - disabled: 40 GB GPU RAM and 12 samples/second # - disabled: 40 GB GPU RAM and 12 samples/second
gradient_checkpointing_kwargs={'use_reentrant': False}, # Use newer implementation that will become the default. gradient_checkpointing_kwargs={'use_reentrant': False}, # Use newer implementation that will become the default.
# We don't need the following two lines for FSDP (compared to DDP): # We don't need the following two lines for FSDP (compared to DDP):
# ddp_find_unused_parameters=False, # Set to False when using gradient checkpointing to suppress warning message. # ddp_find_unused_parameters=False, # Set to False when using gradient checkpointing to suppress warning message.
# log_level_replica='error', # Disable warnings in all but the first process. # log_level_replica='error', # Disable warnings in all but the first process.
optim='adamw_torch', optim='adamw_torch',
learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B
logging_strategy='no', logging_strategy='no',
# logging_strategy='steps', # 'no', 'epoch' or 'steps' # logging_strategy='steps', # 'no', 'epoch' or 'steps'
# logging_steps=10, # logging_steps=10,
save_strategy='no', # 'no', 'epoch' or 'steps' save_strategy='no', # 'no', 'epoch' or 'steps'
# save_steps=2000, # save_steps=2000,
# num_train_epochs=5, # num_train_epochs=5,
max_steps=100, max_steps=100,
bf16=True, # mixed precision training bf16=True, # mixed precision training
report_to='none', # disable wandb report_to='none', # disable wandb
max_seq_length=1024, max_seq_length=1024,
) )
def formatting_func(entry): def formatting_func(entry):
return tokenizer.apply_chat_template(entry['messages'], tokenize=False) return tokenizer.apply_chat_template(entry['messages'], tokenize=False)
trainer = SFTTrainer( trainer = SFTTrainer(
model=model, model=model,
args=training_arguments, args=training_arguments,
train_dataset=guanaco_train, train_dataset=guanaco_train,
eval_dataset=guanaco_test, eval_dataset=guanaco_test,
processing_class=tokenizer, processing_class=tokenizer,
formatting_func=formatting_func, formatting_func=formatting_func,
) )
if process_index == 0: # Only print in first process. if process_index == 0: # Only print in first process.
if hasattr(trainer.model, "print_trainable_parameters"): if hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters() trainer.model.print_trainable_parameters()
train_result = trainer.train() train_result = trainer.train()
if process_index == 0: if process_index == 0:
print("Training result:") print("Training result:")
print(train_result) print(train_result)
# Print memory usage once per node: # Print memory usage once per node:
if local_process_index == 0: if local_process_index == 0:
print_gpu_utilization() print_gpu_utilization()
# # Save model in first process only: # # Save model in first process only:
# if process_index == 0: # if process_index == 0:
# if trainer.is_fsdp_enabled: # if trainer.is_fsdp_enabled:
# trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") # trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
# trainer.save_model() # trainer.save_model()
``` ```
%% Output %% Output
Overwriting phi3_guanaco_fsdp.py Overwriting phi3_guanaco_fsdp.py
%% Cell type:markdown id:d64e5aa1-7732-481f-a8db-451360ba6d74 tags: %% Cell type:markdown id:d64e5aa1-7732-481f-a8db-451360ba6d74 tags:
#### Next, we write a file with the configuration for FSDP: #### Next, we write a file with the configuration for FSDP:
%% Cell type:code id:a7e80b63-c5f8-4166-b0dc-4d48c9b4c9ac tags: %% Cell type:code id:a7e80b63-c5f8-4166-b0dc-4d48c9b4c9ac tags:
``` python ``` python
%%writefile fsdp_config.yml %%writefile fsdp_config.yml
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
debug: false debug: false
distributed_type: FSDP distributed_type: FSDP
downcast_bf16: 'no' downcast_bf16: 'no'
fsdp_config: fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE fsdp_backward_prefetch: BACKWARD_PRE
fsdp_cpu_ram_efficient_loading: true fsdp_cpu_ram_efficient_loading: true
fsdp_forward_prefetch: false fsdp_forward_prefetch: false
fsdp_offload_params: false fsdp_offload_params: false
fsdp_sharding_strategy: FULL_SHARD fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: SHARDED_STATE_DICT fsdp_state_dict_type: SHARDED_STATE_DICT
fsdp_sync_module_states: true fsdp_sync_module_states: true
fsdp_use_orig_params: false fsdp_use_orig_params: false
machine_rank: 0 machine_rank: 0
main_training_function: main main_training_function: main
mixed_precision: bf16 mixed_precision: bf16
num_machines: 1 num_machines: 1
num_processes: 1 num_processes: 1
rdzv_backend: c10d rdzv_backend: c10d
same_network: true same_network: true
tpu_env: [] tpu_env: []
tpu_use_cluster: false tpu_use_cluster: false
tpu_use_sudo: false tpu_use_sudo: false
use_cpu: false use_cpu: false
``` ```
%% Output %% Output
Overwriting fsdp_config.yml Overwriting fsdp_config.yml
%% Cell type:markdown id:a1769457-c82f-4954-89a5-7b3b47ed72cc tags: %% Cell type:markdown id:a1769457-c82f-4954-89a5-7b3b47ed72cc tags:
#### Finally, we write the SLURM script: #### Finally, we write the SLURM script:
%% Cell type:code id:73291081-25e7-4578-a944-716d4e29d74b tags: %% Cell type:code id:73291081-25e7-4578-a944-716d4e29d74b tags:
``` python ``` python
%%writefile run_phi3_guanaco_fsdp.slurm %%writefile run_phi3_guanaco_fsdp.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_EUD20b #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1 #SBATCH --nodes=1
#SBATCH --gpus-per-task=2 # up to 4 on Leonardo #SBATCH --gpus-per-task=2 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=240GB # should be 120GB * gpus-per-task on Leonardo #SBATCH --mem=240GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Set environment variables for communication between nodes: # Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1) export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments: # Set launcher and launcher arguments:
export LAUNCHER="accelerate launch \ export LAUNCHER="accelerate launch \
--num_machines $SLURM_NNODES \ --num_machines $SLURM_NNODES \
--num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \ --num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE)) \
--num_cpu_threads_per_process 8 \ --num_cpu_threads_per_process 8 \
--main_process_ip $MASTER_ADDR \ --main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \ --main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \ --machine_rank \$SLURM_PROCID \
--config_file \"fsdp_config.yml\" \ --config_file \"fsdp_config.yml\" \
" "
# Set training script that will be executed: # Set training script that will be executed:
export PROGRAM="phi3_guanaco_fsdp.py" export PROGRAM="phi3_guanaco_fsdp.py"
# Run: # Run:
time srun bash -c "$LAUNCHER $PROGRAM" time srun bash -c "$LAUNCHER $PROGRAM"
``` ```
%% Output %% Output
Overwriting run_phi3_guanaco_fsdp.slurm Overwriting run_phi3_guanaco_fsdp.slurm
%% Cell type:markdown id:4786f174-8231-4e1e-ae39-bff66ffccddc tags: %% Cell type:markdown id:4786f174-8231-4e1e-ae39-bff66ffccddc tags:
#### We can now execute the SLURM script and, once the job has run, look at the output:
%% Cell type:code id:8e8cc6fe-ec18-4856-b99a-e1e2f4f5ca86 tags: %% Cell type:code id:8e8cc6fe-ec18-4856-b99a-e1e2f4f5ca86 tags:
``` python ``` python
!sbatch run_phi3_guanaco_fsdp.slurm !sbatch run_phi3_guanaco_fsdp.slurm
``` ```
%% Output %% Output
Submitted batch job 12957529 Submitted batch job 12957529
%% Cell type:code id:7886c3e1-da04-49b9-bf9d-806083239ad9 tags: %% Cell type:code id:7886c3e1-da04-49b9-bf9d-806083239ad9 tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12957529 boost_usr run_phi3 mpfister R 1:38 1 lrdn1681 12957529 boost_usr run_phi3 mpfister R 1:38 1 lrdn1681
12952669 boost_usr jupyterl mpfister R 3:54:53 1 lrdn3456 12952669 boost_usr jupyterl mpfister R 3:54:53 1 lrdn3456
%% Cell type:code id:55f36afe-483c-4f44-aefc-be4a2304ab8f tags: %% Cell type:code id:55f36afe-483c-4f44-aefc-be4a2304ab8f tags:
``` python ``` python
!cat slurm-12957529.out !cat slurm-12957529.out
``` ```
%% Output %% Output
+ date + date
Mon Feb 24 20:03:18 CET 2025 Mon Feb 24 20:03:18 CET 2025
+ hostname + hostname
lrdn1681.leonardo.local lrdn1681.leonardo.local
+ nvidia-smi + nvidia-smi
Mon Feb 24 20:03:18 2025 Mon Feb 24 20:03:18 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:1D:00.0 Off | 0 |
| N/A 43C P0 64W / 478W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 64W / 478W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 | | 1 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 |
| N/A 43C P0 64W / 481W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 64W / 481W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
++ shuf -i 20000-30000 -n 1 ++ shuf -i 20000-30000 -n 1
+ export MASTER_PORT=21982 + export MASTER_PORT=21982
+ MASTER_PORT=21982 + MASTER_PORT=21982
++ scontrol show hostnames lrdn1681 ++ scontrol show hostnames lrdn1681
++ head -n 1 ++ head -n 1
+ export MASTER_ADDR=lrdn1681 + export MASTER_ADDR=lrdn1681
+ MASTER_ADDR=lrdn1681 + MASTER_ADDR=lrdn1681
+ export OMP_NUM_THREADS=16 + export OMP_NUM_THREADS=16
+ OMP_NUM_THREADS=16 + OMP_NUM_THREADS=16
+ export 'LAUNCHER=accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" ' + export 'LAUNCHER=accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" '
+ LAUNCHER='accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" ' + LAUNCHER='accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" '
+ export PROGRAM=phi3_guanaco_fsdp.py + export PROGRAM=phi3_guanaco_fsdp.py
+ PROGRAM=phi3_guanaco_fsdp.py + PROGRAM=phi3_guanaco_fsdp.py
+ srun bash -c 'accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" phi3_guanaco_fsdp.py' + srun bash -c 'accelerate launch --num_machines 1 --num_processes 2 --num_cpu_threads_per_process 8 --main_process_ip lrdn1681 --main_process_port 21982 --machine_rank $SLURM_PROCID --config_file "fsdp_config.yml" phi3_guanaco_fsdp.py'
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 1.90s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 1.90s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.59s/it] Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00, 2.59s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
[rank1]:[W224 20:03:53.659277942 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank1]:[W224 20:03:53.659277942 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[rank0]:[W224 20:03:53.862336206 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank0]:[W224 20:03:53.862336206 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-24 20:03:53,796] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 20:03:53,796] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-24 20:03:53,796] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 20:03:53,796] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543 trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
0%| | 0/100 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences. 0%| | 0/100 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences. You are not running the flash-attention implementation, expect numerical differences.
88%|████████▊ | 88{'train_runtime': 199.6739, 'train_samples_per_second': 8.013, 'train_steps_per_second': 0.501, 'train_loss': 1.1833365631103516, 'epoch': 0.16} 88%|████████▊ | 88{'train_runtime': 199.6739, 'train_samples_per_second': 8.013, 'train_steps_per_second': 0.501, 'train_loss': 1.1833365631103516, 'epoch': 0.16}
100%|██████████| 100/100 [03:19<00:00, 2.00s/it] 100%|██████████| 100/100 [03:19<00:00, 2.00s/it]
Training result: Training result:
TrainOutput(global_step=100, training_loss=1.1833365631103516, metrics={'train_runtime': 199.6739, 'train_samples_per_second': 8.013, 'train_steps_per_second': 0.501, 'total_flos': 2.709130293595341e+16, 'train_loss': 1.1833365631103516, 'epoch': 0.16233766233766234}) TrainOutput(global_step=100, training_loss=1.1833365631103516, metrics={'train_runtime': 199.6739, 'train_samples_per_second': 8.013, 'train_steps_per_second': 0.501, 'total_flos': 2.709130293595341e+16, 'train_loss': 1.1833365631103516, 'epoch': 0.16233766233766234})
Memory occupied on GPUs: 13.6 + 16.0 GB. Memory occupied on GPUs: 13.6 + 16.0 GB.
real 3m59.518s real 3m59.518s
user 0m0.272s user 0m0.272s
sys 0m0.009s sys 0m0.009s
%% Cell type:markdown id:ad61e8d3-5625-4a78-8a08-32ea4158a581 tags: %% Cell type:markdown id:ad61e8d3-5625-4a78-8a08-32ea4158a581 tags:
#### Finally, we can clean up and delete the files that we just created: #### Finally, we can clean up and delete the files that we just created:
%% Cell type:code id:6a52fe7c-9bfe-45d0-84ed-9287c9c84f0c tags: %% Cell type:code id:6a52fe7c-9bfe-45d0-84ed-9287c9c84f0c tags:
``` python ``` python
!rm fsdp_config.yml phi3_guanaco_fsdp.py run_phi3_guanaco_fsdp.slurm slurm-*.out !rm fsdp_config.yml phi3_guanaco_fsdp.py run_phi3_guanaco_fsdp.slurm slurm-*.out
``` ```
%% Cell type:code id:917d8108-1e84-45c2-8560-af8958deb599 tags: %% Cell type:code id:917d8108-1e84-45c2-8560-af8958deb599 tags:
``` python ``` python
``` ```
......
%% Cell type:markdown id:d0b1f163-108f-4587-8b0d-0246158ee528 tags: %% Cell type:markdown id:d0b1f163-108f-4587-8b0d-0246158ee528 tags:
# Liger kernel example with DDP, Mistral 7B instruct and openassistant-guanaco dataset # Liger kernel example with DDP, Mistral 7B instruct and openassistant-guanaco dataset
In the first course, we demonstrated a speed-up by using the *unsloth* library, which contains optimized GPU kernels created by manually deriving all compute-heavy math steps. *Unsloth* only works for single-GPU training, though. Another library that also offers optimized GPU kernels is *liger*, which has the advantage that it can also be used for multi-GPU training.
In this notebook, we demonstrate finetuning of *Mistral 7B instruct* on two GPUs using DDP and *liger kernels*. The same script is run twice, once without and once with *liger kernels*, in order to compare speed and memory usage.
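As a quick orientation before the full script, the following rough sketch shows the two usual ways of enabling Liger kernels. `AutoLigerKernelForCausalLM` is the class used in the script below; the explicit patch function `apply_liger_kernel_to_mistral` is an assumption based on the `apply_liger_kernel_to_*` family mentioned in the script, so check the Liger-Kernel README if in doubt.

``` python
# Rough sketch (illustration only): two ways to enable Liger kernels.
import torch
from transformers import AutoModelForCausalLM

model_name = 'mistralai/Mistral-7B-Instruct-v0.3'

# Option 1: monkey-patch the Hugging Face Mistral implementation in place and then
# load the model as usual (the patch function name is assumed here, following the
# apply_liger_kernel_to_* family referenced in the training script below):
from liger_kernel.transformers import apply_liger_kernel_to_mistral
apply_liger_kernel_to_mistral()
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# Option 2: let the wrapper class detect the architecture and apply the patches itself,
# which is what the script below does when '--enable-liger' is passed:
from liger_kernel.transformers import AutoLigerKernelForCausalLM
model = AutoLigerKernelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
```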
%% Cell type:markdown id:d7d9ee84-e29b-4c05-b124-50e735033760 tags: %% Cell type:markdown id:d7d9ee84-e29b-4c05-b124-50e735033760 tags:
#### First, we write the python code to a file: #### First, we write the python code to a file:
%% Cell type:code id:565c4533-5104-4a7c-a688-8b6acb72e17d tags: %% Cell type:code id:565c4533-5104-4a7c-a688-8b6acb72e17d tags:
``` python ``` python
%%writefile phi3_guanaco_ddp_liger.py %%writefile phi3_guanaco_ddp_liger.py
# Import libraries # Import libraries
import torch import torch
from accelerate import PartialState from accelerate import PartialState
from datasets import load_dataset from datasets import load_dataset
from peft import LoraConfig, get_peft_model from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig from trl import SFTTrainer, SFTConfig
import pynvml import pynvml
import sys import sys
if len(sys.argv) >= 2 and sys.argv[1] == '--enable-liger': if len(sys.argv) >= 2 and sys.argv[1] == '--enable-liger':
enable_liger = True enable_liger = True
print('Using liger kernels.') print('Using liger kernels.')
else: else:
enable_liger = False enable_liger = False
print('Not using liger kernels.') print('Not using liger kernels.')
# Import liger kernels and apply automatic monkey-patching to models: # Import liger kernels and apply automatic monkey-patching to models:
# from liger_kernel.transformers import apply_liger_kernel_to_phi3 # from liger_kernel.transformers import apply_liger_kernel_to_phi3
# apply_liger_kernel_to_phi3() # apply_liger_kernel_to_phi3()
# https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#getting-started # https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#getting-started
from liger_kernel.transformers import AutoLigerKernelForCausalLM from liger_kernel.transformers import AutoLigerKernelForCausalLM
def print_gpu_utilization(): def print_gpu_utilization():
pynvml.nvmlInit() pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount() device_count = pynvml.nvmlDeviceGetCount()
memory_used = [] memory_used = []
for device_index in range(device_count): for device_index in range(device_count):
device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index) device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle) device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
memory_used.append(device_info.used/1024**3) memory_used.append(device_info.used/1024**3)
print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.') print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')
# Choose a model and load tokenizer and model (using 4bit quantization): # Choose a model and load tokenizer and model (using 4bit quantization):
model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/mistralai--Mistral-7B-Instruct-v0.3' model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/mistralai--Mistral-7B-Instruct-v0.3'
# model_name = 'mistralai/Mistral-7B-Instruct-v0.3' # model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right' tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token # Needed for Mistral 7B tokenizer.pad_token = tokenizer.eos_token # Needed for Mistral 7B
# For multi-GPU training, find out how many GPUs there are and which one we should use: # For multi-GPU training, find out how many GPUs there are and which one we should use:
ps = PartialState() ps = PartialState()
num_processes = ps.num_processes num_processes = ps.num_processes
process_index = ps.process_index process_index = ps.process_index
local_process_index = ps.local_process_index local_process_index = ps.local_process_index
if enable_liger: if enable_liger:
model = AutoLigerKernelForCausalLM.from_pretrained( model = AutoLigerKernelForCausalLM.from_pretrained(
model_name, model_name,
quantization_config=BitsAndBytesConfig( quantization_config=BitsAndBytesConfig(
load_in_4bit=True, load_in_4bit=True,
bnb_4bit_quant_type='nf4', bnb_4bit_quant_type='nf4',
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_compute_dtype=torch.bfloat16,
), ),
device_map={'':local_process_index}, # Changed for DDP device_map={'':local_process_index}, # Changed for DDP
attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2" attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2"
trust_remote_code=True, trust_remote_code=True,
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
) )
else: else:
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
quantization_config=BitsAndBytesConfig( quantization_config=BitsAndBytesConfig(
load_in_4bit=True, load_in_4bit=True,
bnb_4bit_quant_type='nf4', bnb_4bit_quant_type='nf4',
bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_compute_dtype=torch.bfloat16,
), ),
device_map={'':local_process_index}, # Changed for DDP device_map={'':local_process_index}, # Changed for DDP
attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2" attn_implementation='eager', # 'eager', 'sdpa', or "flash_attention_2"
trust_remote_code=True, trust_remote_code=True,
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
) )
# Load the guanaco dataset # Load the guanaco dataset
guanaco_train = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='train') guanaco_train = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='train')
guanaco_test = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='test') guanaco_test = load_dataset('/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', split='test')
# guanaco_train = load_dataset('timdettmers/openassistant-guanaco', split='train') # guanaco_train = load_dataset('timdettmers/openassistant-guanaco', split='train')
# guanaco_test = load_dataset('timdettmers/openassistant-guanaco', split='test') # guanaco_test = load_dataset('timdettmers/openassistant-guanaco', split='test')
guanaco_train = guanaco_train.map(lambda entry: { guanaco_train = guanaco_train.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
guanaco_test = guanaco_test.map(lambda entry: { guanaco_test = guanaco_test.map(lambda entry: {
'question1': entry['text'].split('###')[1].removeprefix(' Human: '), 'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ') 'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
}) })
guanaco_train = guanaco_train.map(lambda entry: {'messages': [ guanaco_train = guanaco_train.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
guanaco_test = guanaco_test.map(lambda entry: {'messages': [ guanaco_test = guanaco_test.map(lambda entry: {'messages': [
{'role': 'user', 'content': entry['question1']}, {'role': 'user', 'content': entry['question1']},
{'role': 'assistant', 'content': entry['answer1']} {'role': 'assistant', 'content': entry['answer1']}
]}) ]})
model.config.use_cache = False # KV cache can only speed up inference, but we are doing training. model.config.use_cache = False # KV cache can only speed up inference, but we are doing training.
# Add low-rank adapters (LORA) to the model: # Add low-rank adapters (LORA) to the model:
peft_config = LoraConfig( peft_config = LoraConfig(
task_type='CAUSAL_LM', task_type='CAUSAL_LM',
r=16, r=16,
    lora_alpha=32, # rule of thumb: lora_alpha should be about 2 * r
lora_dropout=0.05, lora_dropout=0.05,
bias='none', bias='none',
target_modules='all-linear', target_modules='all-linear',
) )
model = get_peft_model(model, peft_config) model = get_peft_model(model, peft_config)
training_arguments = SFTConfig( training_arguments = SFTConfig(
output_dir='output/phi-3.5-mini-instruct-guanaco-ddp-liger', output_dir='output/phi-3.5-mini-instruct-guanaco-ddp-liger',
per_device_train_batch_size=32//num_processes, # Adjust per-device batch size for DDP per_device_train_batch_size=32//num_processes, # Adjust per-device batch size for DDP
gradient_accumulation_steps=1, gradient_accumulation_steps=1,
gradient_checkpointing=True, # Gradient checkpointing improves memory efficiency, but slows down training, gradient_checkpointing=True, # Gradient checkpointing improves memory efficiency, but slows down training,
# e.g. Mistral 7B with PEFT using bitsandbytes: # e.g. Mistral 7B with PEFT using bitsandbytes:
# - enabled: 11 GB GPU RAM and 8 samples/second # - enabled: 11 GB GPU RAM and 8 samples/second
# - disabled: 40 GB GPU RAM and 12 samples/second # - disabled: 40 GB GPU RAM and 12 samples/second
gradient_checkpointing_kwargs={'use_reentrant': False}, # Use newer implementation that will become the default. gradient_checkpointing_kwargs={'use_reentrant': False}, # Use newer implementation that will become the default.
ddp_find_unused_parameters=False, # Set to False when using gradient checkpointing to suppress warning message. ddp_find_unused_parameters=False, # Set to False when using gradient checkpointing to suppress warning message.
log_level_replica='error', # Disable warnings in all but the first process. log_level_replica='error', # Disable warnings in all but the first process.
optim='adamw_torch', optim='adamw_torch',
learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B
logging_strategy='no', logging_strategy='no',
# logging_strategy='steps', # 'no', 'epoch' or 'steps' # logging_strategy='steps', # 'no', 'epoch' or 'steps'
# logging_steps=10, # logging_steps=10,
save_strategy='no', # 'no', 'epoch' or 'steps' save_strategy='no', # 'no', 'epoch' or 'steps'
# save_steps=2000, # save_steps=2000,
# num_train_epochs=5, # num_train_epochs=5,
max_steps=20, max_steps=20,
bf16=True, # mixed precision training bf16=True, # mixed precision training
report_to='none', # disable wandb report_to='none', # disable wandb
max_seq_length=1024, max_seq_length=1024,
) )
def formatting_func(entry): def formatting_func(entry):
return tokenizer.apply_chat_template(entry['messages'], tokenize=False) return tokenizer.apply_chat_template(entry['messages'], tokenize=False)
trainer = SFTTrainer( trainer = SFTTrainer(
model=model, model=model,
args=training_arguments, args=training_arguments,
train_dataset=guanaco_train, train_dataset=guanaco_train,
eval_dataset=guanaco_test, eval_dataset=guanaco_test,
processing_class=tokenizer, processing_class=tokenizer,
formatting_func=formatting_func, formatting_func=formatting_func,
) )
if process_index == 0: # Only print in first process. if process_index == 0: # Only print in first process.
if hasattr(trainer.model, "print_trainable_parameters"): if hasattr(trainer.model, "print_trainable_parameters"):
trainer.model.print_trainable_parameters() trainer.model.print_trainable_parameters()
# eval_result = trainer.evaluate() # eval_result = trainer.evaluate()
# if process_index == 0: # if process_index == 0:
# print("Evaluation on test dataset before finetuning:") # print("Evaluation on test dataset before finetuning:")
# print(eval_result) # print(eval_result)
train_result = trainer.train() train_result = trainer.train()
if process_index == 0: if process_index == 0:
print("Training result:") print("Training result:")
print(train_result) print(train_result)
# eval_result = trainer.evaluate() # eval_result = trainer.evaluate()
# if process_index == 0: # if process_index == 0:
# print("Evaluation on test dataset after finetuning:") # print("Evaluation on test dataset after finetuning:")
# print(eval_result) # print(eval_result)
# Print memory usage once per node: # Print memory usage once per node:
if local_process_index == 0: if local_process_index == 0:
print_gpu_utilization() print_gpu_utilization()
# # Save model in first process only: # # Save model in first process only:
# if process_index == 0: # if process_index == 0:
# trainer.save_model() # trainer.save_model()
``` ```
%% Output %% Output
Overwriting phi3_guanaco_ddp_liger.py Overwriting phi3_guanaco_ddp_liger.py
%% Cell type:markdown id:c9519e17-ec88-4b56-ab1d-1fbc4cb15612 tags: %% Cell type:markdown id:c9519e17-ec88-4b56-ab1d-1fbc4cb15612 tags:
#### Next, we write the SLURM script and again submit it to the scheduler:
%% Cell type:code id:7c012ab2-b6ad-4078-aa60-a37ada2c8012 tags: %% Cell type:code id:7c012ab2-b6ad-4078-aa60-a37ada2c8012 tags:
``` python ``` python
%%writefile run_phi3_guanaco_ddp_liger.slurm %%writefile run_phi3_guanaco_ddp_liger.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_EUD20b #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1 #SBATCH --nodes=1
#SBATCH --gpus-per-task=2 # up to 4 on Leonardo #SBATCH --gpus-per-task=2 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=240GB # should be 120GB * gpus-per-task on Leonardo #SBATCH --mem=240GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=16 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:30:00 #SBATCH --time=0:30:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Set environment variables for communication between nodes: # Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port export MASTER_PORT=$(shuf -i 20000-30000 -n 1) # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1) export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# Set launcher and launcher arguments: # Set launcher and launcher arguments:
export LAUNCHER="torchrun \ export LAUNCHER="torchrun \
--nnodes=$SLURM_JOB_NUM_NODES \ --nnodes=$SLURM_JOB_NUM_NODES \
--nproc_per_node=$SLURM_GPUS_ON_NODE \ --nproc_per_node=$SLURM_GPUS_ON_NODE \
--rdzv_id=$SLURM_JOB_ID \ --rdzv_id=$SLURM_JOB_ID \
--rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \ --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
--rdzv_backend=c10d" --rdzv_backend=c10d"
# Set training script that will be executed: # Set training script that will be executed:
export PROGRAM="phi3_guanaco_ddp_liger.py" export PROGRAM="phi3_guanaco_ddp_liger.py"
# Run: # Run:
time srun bash -c "$LAUNCHER $PROGRAM" time srun bash -c "$LAUNCHER $PROGRAM"
time srun bash -c "$LAUNCHER $PROGRAM --enable-liger" time srun bash -c "$LAUNCHER $PROGRAM --enable-liger"
``` ```
%% Output %% Output
Overwriting run_phi3_guanaco_ddp_liger.slurm Overwriting run_phi3_guanaco_ddp_liger.slurm
%% Cell type:code id:c7a766c6-f7ec-49f6-a557-aa0b5d8ec357 tags: %% Cell type:code id:c7a766c6-f7ec-49f6-a557-aa0b5d8ec357 tags:
``` python ``` python
!sbatch run_phi3_guanaco_ddp_liger.slurm !sbatch run_phi3_guanaco_ddp_liger.slurm
``` ```
%% Output %% Output
Submitted batch job 12958288 Submitted batch job 12958288
%% Cell type:code id:37fd35dd-3ba0-4627-a53c-ca6542de7b2f tags: %% Cell type:code id:37fd35dd-3ba0-4627-a53c-ca6542de7b2f tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12958288 boost_usr run_phi3 mpfister CF 0:02 1 lrdn1261 12958288 boost_usr run_phi3 mpfister CF 0:02 1 lrdn1261
12952669 boost_usr jupyterl mpfister R 5:03:54 1 lrdn3456 12952669 boost_usr jupyterl mpfister R 5:03:54 1 lrdn3456
%% Cell type:code id:653a5bcf-4827-4f02-939a-72410c7a12f3 tags: %% Cell type:code id:653a5bcf-4827-4f02-939a-72410c7a12f3 tags:
``` python ``` python
!cat slurm-12958288.out !cat slurm-12958288.out
``` ```
%% Output %% Output
+ date + date
Mon Feb 24 21:13:54 CET 2025 Mon Feb 24 21:13:54 CET 2025
+ hostname + hostname
lrdn1261.leonardo.local lrdn1261.leonardo.local
+ nvidia-smi + nvidia-smi
Mon Feb 24 21:13:54 2025 Mon Feb 24 21:13:54 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 |
| N/A 43C P0 67W / 477W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 67W / 477W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM-64GB On | 00000000:C8:00.0 Off | 0 | | 1 NVIDIA A100-SXM-64GB On | 00000000:C8:00.0 Off | 0 |
| N/A 43C P0 62W / 454W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 62W / 454W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
++ shuf -i 20000-30000 -n 1 ++ shuf -i 20000-30000 -n 1
+ export MASTER_PORT=26849 + export MASTER_PORT=26849
+ MASTER_PORT=26849 + MASTER_PORT=26849
++ head -n 1 ++ head -n 1
++ scontrol show hostnames lrdn1261 ++ scontrol show hostnames lrdn1261
+ export MASTER_ADDR=lrdn1261 + export MASTER_ADDR=lrdn1261
+ MASTER_ADDR=lrdn1261 + MASTER_ADDR=lrdn1261
+ export OMP_NUM_THREADS=16 + export OMP_NUM_THREADS=16
+ OMP_NUM_THREADS=16 + OMP_NUM_THREADS=16
+ export 'LAUNCHER=torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d' + export 'LAUNCHER=torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d'
+ LAUNCHER='torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d' + LAUNCHER='torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d'
+ export PROGRAM=phi3_guanaco_ddp_liger.py + export PROGRAM=phi3_guanaco_ddp_liger.py
+ PROGRAM=phi3_guanaco_ddp_liger.py + PROGRAM=phi3_guanaco_ddp_liger.py
+ srun bash -c 'torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d phi3_guanaco_ddp_liger.py' + srun bash -c 'torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d phi3_guanaco_ddp_liger.py'
Not using liger kernels. Not using liger kernels.
Not using liger kernels. Not using liger kernels.
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00, 2.33s/it] Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00, 2.33s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.79s/it] Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.79s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
[rank1]:[W224 21:14:21.424767353 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank1]:[W224 21:14:21.424767353 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[rank0]:[W224 21:14:21.683591198 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank0]:[W224 21:14:21.683591198 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-24 21:14:21,910] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 21:14:21,910] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-24 21:14:21,940] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 21:14:21,940] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754 trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754
{'train_runtime': 116.7785, 'train_samples_per_second': 5.48, 'train_steps_per_second': 0.171, 'train_loss': 1.2621638298034668, 'epoch': 0.06}
100%|██████████| 20/20 [01:56<00:00, 5.84s/it] 100%|██████████| 20/20 [01:56<00:00, 5.84s/it]
Training result: Training result:
TrainOutput(global_step=20, training_loss=1.2621638298034668, metrics={'train_runtime': 116.7785, 'train_samples_per_second': 5.48, 'train_steps_per_second': 0.171, 'total_flos': 2.26549861187584e+16, 'train_loss': 1.2621638298034668, 'epoch': 0.06493506493506493}) TrainOutput(global_step=20, training_loss=1.2621638298034668, metrics={'train_runtime': 116.7785, 'train_samples_per_second': 5.48, 'train_steps_per_second': 0.171, 'total_flos': 2.26549861187584e+16, 'train_loss': 1.2621638298034668, 'epoch': 0.06493506493506493})
Memory occupied on GPUs: 36.0 + 34.2 GB. Memory occupied on GPUs: 36.0 + 34.2 GB.
real 2m27.358s real 2m27.358s
user 0m0.271s user 0m0.271s
sys 0m0.006s sys 0m0.006s
+ srun bash -c 'torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d phi3_guanaco_ddp_liger.py --enable-liger' + srun bash -c 'torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=12958288 --rdzv_endpoint=lrdn1261:26849 --rdzv_backend=c10d phi3_guanaco_ddp_liger.py --enable-liger'
Using liger kernels. Using liger kernels.
Using liger kernels. Using liger kernels.
Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00, 2.63s/it] Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00, 2.63s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00, 2.27s/it] Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00, 2.27s/it]
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty. Repo card metadata block was not found. Setting CardData to empty.
[rank1]:[W224 21:16:47.511289577 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank1]:[W224 21:16:47.511289577 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
[rank0]:[W224 21:16:48.100371256 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank0]:[W224 21:16:48.100371256 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2025-02-24 21:16:48,329] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 21:16:48,329] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-02-24 21:16:48,362] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2025-02-24 21:16:48,362] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)
trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754 trainable params: 41,943,040 || all params: 7,289,966,592 || trainable%: 0.5754
{'train_runtime': 105.2095, 'train_samples_per_second': 6.083, 'train_steps_per_second': 0.19, 'train_loss': 1.2602069854736329, 'epoch': 0.06}
100%|██████████| 20/20 [01:45<00:00, 5.26s/it] 100%|██████████| 20/20 [01:45<00:00, 5.26s/it]
Training result: Training result:
TrainOutput(global_step=20, training_loss=1.2602069854736329, metrics={'train_runtime': 105.2095, 'train_samples_per_second': 6.083, 'train_steps_per_second': 0.19, 'total_flos': 2.26549861187584e+16, 'train_loss': 1.2602069854736329, 'epoch': 0.06493506493506493}) TrainOutput(global_step=20, training_loss=1.2602069854736329, metrics={'train_runtime': 105.2095, 'train_samples_per_second': 6.083, 'train_steps_per_second': 0.19, 'total_flos': 2.26549861187584e+16, 'train_loss': 1.2602069854736329, 'epoch': 0.06493506493506493})
Memory occupied on GPUs: 30.9 + 32.1 GB. Memory occupied on GPUs: 30.9 + 32.1 GB.
real 2m14.605s real 2m14.605s
user 0m0.268s user 0m0.268s
sys 0m0.008s sys 0m0.008s
%% Cell type:code id:56dd8745-c6fc-4b26-b5f1-34c245e840d3 tags: %% Cell type:code id:56dd8745-c6fc-4b26-b5f1-34c245e840d3 tags:
``` python ``` python
``` ```
%% Cell type:markdown id:e102f7d9-ca9f-486e-937e-c3ee3a09fc40 tags: %% Cell type:markdown id:e102f7d9-ca9f-486e-937e-c3ee3a09fc40 tags:
#### Finally, we can clean up and delete the files that we just created: #### Finally, we can clean up and delete the files that we just created:
%% Cell type:code id:6a52fe7c-9bfe-45d0-84ed-9287c9c84f0c tags: %% Cell type:code id:6a52fe7c-9bfe-45d0-84ed-9287c9c84f0c tags:
``` python ``` python
!rm phi3_guanaco_ddp_liger.py run_phi3_guanaco_ddp_liger.slurm slurm-*.out !rm phi3_guanaco_ddp_liger.py run_phi3_guanaco_ddp_liger.slurm slurm-*.out
``` ```
%% Cell type:code id:e15b7d0b-924a-46f1-a23f-42ee5964dfec tags: %% Cell type:code id:e15b7d0b-924a-46f1-a23f-42ee5964dfec tags:
``` python ``` python
``` ```
......
%% Cell type:markdown id:fc68bcd6-bca8-4376-b231-0c8c84c532e4 tags: %% Cell type:markdown id:fc68bcd6-bca8-4376-b231-0c8c84c532e4 tags:
## Gradio ## Gradio
%% Cell type:markdown id:c96197c4-9bdf-4895-a209-f97a30660b66 tags: %% Cell type:markdown id:c96197c4-9bdf-4895-a209-f97a30660b66 tags:
[Gradio](https://www.gradio.app) makes it easy to put a simple web interface on top of your software. In this example, we use Gradio to get a simple chat interface to a large language model.
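At its core, a chat UI only needs a Python function that maps the latest message (plus the chat history) to a reply. A minimal sketch, assuming `gradio` is installed in the environment and using a placeholder echo function instead of a model:

``` python
import gradio as gr

# Placeholder chat function: echoes the message instead of querying a model.
# With type='messages', history arrives as a list of {'role': ..., 'content': ...} dicts,
# the same format that the full example below passes to the text-generation pipeline.
def echo(message, history):
    return f'You said: {message}'

gr.ChatInterface(echo, type='messages').launch()
```

The full example below does the same thing, but replaces the echo function with a quantized Phi-3.5 model and sets the server address and `root_path` so that the interface is reachable through the JupyterHub proxy.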
%% Cell type:code id:e8e10fab-fc0b-41c0-a794-8f73ae823ff8 tags: %% Cell type:code id:e8e10fab-fc0b-41c0-a794-8f73ae823ff8 tags:
``` python ``` python
%%writefile gradio_example.py %%writefile gradio_example.py
# Import necessary libraries # Import necessary libraries
import torch import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import gradio as gr import gradio as gr
import os import os
import random import random
import socket import socket
import sys import sys
ip = socket.gethostbyname(socket.gethostname()) ip = socket.gethostbyname(socket.gethostname())
hostname = socket.gethostname().split('.')[0] hostname = socket.gethostname().split('.')[0]
port = random.randint(10000, 50000) port = random.randint(10000, 50000)
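# A random high port makes it unlikely to clash with services that other users may
# already be running on the same shared compute node.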
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ! Change the trainee user name below to the name in your personal URL:  !
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
trainee_user = 'trainee01' trainee_user = 'trainee01'
print('Open the following URL in your webbrowser:') print('Open the following URL in your webbrowser:')
print(f'https://training.hpc.webredirect.org/{trainee_user}/proxy/absolute/{hostname}:{port}/') print(f'https://training.hpc.webredirect.org/{trainee_user}/proxy/absolute/{hostname}:{port}/')
print('') print('')
sys.stdout.flush() sys.stdout.flush()
model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct' model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
# model_name = 'microsoft/Phi-3.5-mini-instruct' # model_name = 'microsoft/Phi-3.5-mini-instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
model_name, model_name,
device_map='cuda', device_map='cuda',
torch_dtype=torch.bfloat16, torch_dtype=torch.bfloat16,
trust_remote_code=True, trust_remote_code=True,
quantization_config=BitsAndBytesConfig( quantization_config=BitsAndBytesConfig(
load_in_4bit=True, load_in_4bit=True,
bnb_4bit_quant_type="nf4", bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16 bnb_4bit_compute_dtype=torch.bfloat16
) )
) )
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer) pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)
def get_answer(question, history=[]): def get_answer(question, history=[]):
history.append( history.append(
{'role': 'user', 'content': question} {'role': 'user', 'content': question}
) )
result = pipe(history, max_new_tokens=500, return_full_text=False) result = pipe(history, max_new_tokens=500, return_full_text=False)
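    # The text-generation pipeline returns a list with one dict per generated sequence;
    # with return_full_text=False, 'generated_text' holds only the newly generated reply.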
return result[0]['generated_text'].strip() return result[0]['generated_text'].strip()
# return question # return question
chat_interface = gr.ChatInterface(get_answer, type='messages') chat_interface = gr.ChatInterface(get_answer, type='messages')
chat_interface.launch(share=False, server_name=ip, server_port=port, root_path=f'/{trainee_user}/proxy/absolute/{hostname}:{port}') chat_interface.launch(share=False, server_name=ip, server_port=port, root_path=f'/{trainee_user}/proxy/absolute/{hostname}:{port}')
``` ```
%% Output %% Output
Overwriting gradio_example.py Overwriting gradio_example.py
%% Cell type:code id:dc0e4a77-b509-4e9a-8b44-d735c8539ea9 tags: %% Cell type:code id:dc0e4a77-b509-4e9a-8b44-d735c8539ea9 tags:
``` python ``` python
%%writefile run_gradio_example.slurm %%writefile run_gradio_example.slurm
#!/bin/bash #!/bin/bash
#SBATCH --partition=boost_usr_prod #SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg # #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063 #SBATCH --account=tra25_castiel2
#SBATCH --reservation=s_tra_EUD20b #SBATCH --reservation=s_tra_castiel2
## Specify resources: ## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores ## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested ## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=1 #SBATCH --nodes=1
#SBATCH --gpus-per-task=1 # up to 4 on Leonardo #SBATCH --gpus-per-task=1 # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1 # always 1 #SBATCH --ntasks-per-node=1 # always 1
#SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo #SBATCH --mem=120GB # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo #SBATCH --cpus-per-task=8 # should be 8 * gpus-per-task on Leonardo
#SBATCH --time=0:10:00 #SBATCH --time=0:10:00
# Load conda: # Load conda:
# module purge # module purge
# module load miniconda3 # module load miniconda3
# eval "$(conda shell.bash hook)" # eval "$(conda shell.bash hook)"
# conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24 # conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24
# Include commands in output: # Include commands in output:
set -x set -x
# Print current time and date: # Print current time and date:
date date
# Print host name: # Print host name:
hostname hostname
# List available GPUs: # List available GPUs:
nvidia-smi nvidia-smi
# Run AI scripts: # Run AI scripts:
python3 gradio_example.py python3 gradio_example.py
``` ```
%% Output %% Output
Overwriting run_gradio_example.slurm Overwriting run_gradio_example.slurm
%% Cell type:markdown id:7362f8e6-6855-491d-86dd-46de21d4d753 tags: %% Cell type:markdown id:7362f8e6-6855-491d-86dd-46de21d4d753 tags:
Now submit the SLURM job: Now submit the SLURM job:
%% Cell type:code id:9eaf7a89-88ee-45f7-9fd0-fb5820db25c9 tags: %% Cell type:code id:9eaf7a89-88ee-45f7-9fd0-fb5820db25c9 tags:
``` python ``` python
!sbatch run_gradio_example.slurm !sbatch run_gradio_example.slurm
``` ```
%% Output %% Output
Submitted batch job 12959420 Submitted batch job 12959420
%% Cell type:markdown id:87f4ddfe-44de-4b68-aab8-00f08a610e8c tags: %% Cell type:markdown id:87f4ddfe-44de-4b68-aab8-00f08a610e8c tags:
Execute `squeue` to see if your job is already running:
%% Cell type:code id:3e10a89e-425e-4570-993b-c595869183e9 tags: %% Cell type:code id:3e10a89e-425e-4570-993b-c595869183e9 tags:
``` python ``` python
!squeue --me !squeue --me
``` ```
%% Output %% Output
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
12959420 boost_usr run_grad mpfister PD 0:00 1 (None) 12959420 boost_usr run_grad mpfister PD 0:00 1 (None)
12952669 boost_usr jupyterl mpfister R 5:29:26 1 lrdn3456 12952669 boost_usr jupyterl mpfister R 5:29:26 1 lrdn3456
%% Cell type:markdown id:072291e5-fea9-4821-9958-2da0b0ada7b5 tags: %% Cell type:markdown id:072291e5-fea9-4821-9958-2da0b0ada7b5 tags:
Once your job is running, look at the output of the job using the following command (replace the number with the JOBID from above): Once your job is running, look at the output of the job using the following command (replace the number with the JOBID from above):
%% Cell type:code id:0d325f17-2e5c-4b26-82d4-98e5b4f52d26 tags: %% Cell type:code id:0d325f17-2e5c-4b26-82d4-98e5b4f52d26 tags:
``` python ``` python
!cat slurm-12959420.out !cat slurm-12959420.out
``` ```
%% Output %% Output
+ date + date
Mon Feb 24 21:39:29 CET 2025 Mon Feb 24 21:39:29 CET 2025
+ hostname + hostname
lrdn0151.leonardo.local lrdn0151.leonardo.local
+ nvidia-smi + nvidia-smi
Mon Feb 24 21:39:29 2025 Mon Feb 24 21:39:29 2025
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 | | NVIDIA-SMI 530.30.02 Driver Version: 530.30.02 CUDA Version: 12.1 |
|-----------------------------------------+----------------------+----------------------+ |-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. | | | | MIG M. |
|=========================================+======================+======================| |=========================================+======================+======================|
| 0 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 | | 0 NVIDIA A100-SXM-64GB On | 00000000:56:00.0 Off | 0 |
| N/A 43C P0 63W / 477W| 0MiB / 65536MiB | 0% Default | | N/A 43C P0 63W / 477W| 0MiB / 65536MiB | 0% Default |
| | | Disabled | | | | Disabled |
+-----------------------------------------+----------------------+----------------------+ +-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
| Processes: | | Processes: |
| GPU GI CI PID Type Process name GPU Memory | | GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage | | ID ID Usage |
|=======================================================================================| |=======================================================================================|
| No running processes found | | No running processes found |
+---------------------------------------------------------------------------------------+ +---------------------------------------------------------------------------------------+
+ python3 gradio_example.py + python3 gradio_example.py
Open the following URL in your webbrowser: Open the following URL in your webbrowser:
https://training.hpc.webredirect.org/trainee01/proxy/absolute/lrdn0151:17228/ https://training.hpc.webredirect.org/trainee01/proxy/absolute/lrdn0151:17228/
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'. `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`. Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
%% Cell type:markdown id:26aaff63-60eb-4cda-bbac-fb214b0a588a tags: %% Cell type:markdown id:26aaff63-60eb-4cda-bbac-fb214b0a588a tags:
Finally, when you are finished, please cancel the SLURM job to free the resources: Finally, when you are finished, please cancel the SLURM job to free the resources:
%% Cell type:code id:67bd34e4-fa5a-426d-b51b-218269740ed3 tags: %% Cell type:code id:67bd34e4-fa5a-426d-b51b-218269740ed3 tags:
``` python ``` python
!scancel 12959420 !scancel 12959420
``` ```
%% Cell type:markdown id:63c020f6-e785-41f6-bb49-58d56994389c tags: %% Cell type:markdown id:63c020f6-e785-41f6-bb49-58d56994389c tags:
If you want to, you can also delete the files that we created above:
%% Cell type:code id:01cc3e8b-6303-46d4-9051-f3551e527e31 tags: %% Cell type:code id:01cc3e8b-6303-46d4-9051-f3551e527e31 tags:
``` python ``` python
!rm gradio_example.py run_gradio_example.slurm slurm-*.out !rm gradio_example.py run_gradio_example.slurm slurm-*.out
``` ```
%% Cell type:code id:2b049395-b496-4412-abb7-1c5a7f592cce tags: %% Cell type:code id:2b049395-b496-4412-abb7-1c5a7f592cce tags:
``` python ``` python
``` ```
......