Added TensorFlow distributed notebook for multi-node, multi-GPU setup.

784522c4 · Harrison, Simeon · e2d342d4 · 784522c4
Commit 784522c4 authored 9 months ago by Harrison, Simeon
--- a/18_TF_distributed.ipynb
+++ b/18_TF_distributed.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ./tf_distr_slurm.sh\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile ./tf_distr_slurm.sh\n",
+    "#!/bin/bash\n",
+    "\n",
+    "#SBATCH --job-name=tf_distr_example\n",
+    "#SBATCH --account=p70824 # training account (active); change it when running under another project\n",
+    "#SBATCH --nodes=2                    # Number of nodes\n",
+    "#SBATCH --ntasks-per-node=1          # Number of tasks per node\n",
+    "#SBATCH --cpus-per-task=256          # Number of CPU cores per task (including hyperthreading if needed)\n",
+    "#SBATCH --partition=zen3_0512_a100x2\n",
+    "#SBATCH --qos=zen3_0512_a100x2 # qos for training\n",
+    "#SBATCH --gres=gpu:2                 # Number of GPUs per node\n",
+    "#SBATCH --output=./output/%x-%j.out  # Output file\n",
+    "#SBATCH --time=00:10:00\n",
+    "\n",
+    "######################\n",
+    "### Set Environment ###\n",
+    "######################\n",
+    "module load miniconda3\n",
+    "eval \"$(conda shell.bash hook)\"\n",
+    "source /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-huggingface-v2/modules  # Activate the conda environment\n",
+    "#source /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-llm-training-v3\n",
+    "\n",
+    "######################\n",
+    "#### Set Network #####\n",
+    "######################\n",
+    "# Resolve the hostnames of the allocated nodes (the first one is the head node)\n",
+    "nodes=$(scontrol show hostnames \"$SLURM_JOB_NODELIST\")\n",
+    "nodes_array=($nodes)\n",
+    "export NODE_0=${nodes_array[0]}\n",
+    "export NODE_1=${nodes_array[1]}\n",
+    "export MASTER_PORT=29500\n",
+    "\n",
+    "NUM_PROCESSES=$(( SLURM_NNODES * SLURM_GPUS_ON_NODE ))\n",
+    "\n",
+    "######################\n",
+    "### Launch Training ###\n",
+    "######################\n",
+    "srun python3 tf_distr.py\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ./tf_distr.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile ./tf_distr.py\n",
+    "\n",
+    "import os\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import tensorflow as tf\n",
+    "from sklearn.datasets import fetch_openml\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import ConfusionMatrixDisplay\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Dynamically set TF_CONFIG for distributed TensorFlow\n",
+    "def set_tf_config():\n",
+    "    # Retrieve master and worker nodes from environment variables\n",
+    "    node_0 = os.environ['NODE_0']\n",
+    "    node_1 = os.environ['NODE_1']\n",
+    "    task_id = int(os.environ['SLURM_PROCID'])  # SLURM task ID determines worker index\n",
+    "\n",
+    "    # Create a list of workers\n",
+    "    worker_hosts = [f\"{node_0}:29500\", f\"{node_1}:29500\"]\n",
+    "\n",
+    "    # Construct TF_CONFIG\n",
+    "    tf_config = {\n",
+    "        \"cluster\": {\"worker\": worker_hosts},\n",
+    "        \"task\": {\"type\": \"worker\", \"index\": task_id}\n",
+    "    }\n",
+    "    os.environ['TF_CONFIG'] = json.dumps(tf_config)\n",
+    "    print(\"TF_CONFIG set to:\", json.dumps(tf_config, indent=4))\n",
+    "\n",
+    "# Call the TF_CONFIG setup function\n",
+    "set_tf_config()\n",
+    "\n",
+    "# Set up distributed strategy\n",
+    "strategy = tf.distribute.MultiWorkerMirroredStrategy()\n",
+    "\n",
+    "# Load and preprocess the MNIST dataset\n",
+    "mnist = fetch_openml('mnist_784', as_frame=False)\n",
+    "X, y = mnist.data, mnist.target\n",
+    "\n",
+    "# Train-test split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)\n",
+    "X_train = X_train.reshape(-1, 28, 28) / 255.0\n",
+    "X_test = X_test.reshape(-1, 28, 28) / 255.0\n",
+    "y_train = np.array(y_train, dtype=\"int32\")\n",
+    "y_test = np.array(y_test, dtype=\"int32\")\n",
+    "\n",
+    "# Create a distributed dataset\n",
+    "batch_size = 64\n",
+    "train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(batch_size)\n",
+    "test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)\n",
+    "\n",
+    "# Define the model within the strategy's scope\n",
+    "with strategy.scope():\n",
+    "    model = tf.keras.models.Sequential([\n",
+    "        tf.keras.layers.Flatten(input_shape=(28, 28)),\n",
+    "        tf.keras.layers.Dense(128, activation='relu'),\n",
+    "        tf.keras.layers.Dropout(0.2),\n",
+    "        tf.keras.layers.Dense(10, activation=\"softmax\")\n",
+    "    ])\n",
+    "\n",
+    "    model.compile(\n",
+    "        optimizer=tf.keras.optimizers.Adam(),\n",
+    "        loss=tf.keras.losses.SparseCategoricalCrossentropy(),\n",
+    "        metrics=['accuracy']\n",
+    "    )\n",
+    "\n",
+    "# Train the model\n",
+    "model.fit(train_dataset, epochs=20, validation_data=test_dataset)\n",
+    "\n",
+    "# Evaluate the model\n",
+    "y_pred = model.predict(X_test, batch_size=batch_size)\n",
+    "y_preds = tf.argmax(y_pred, axis=1)\n",
+    "\n",
+    "# Print predictions and ground truth\n",
+    "print(\"Predictions:\", y_preds.numpy())\n",
+    "print(\"Ground Truth:\", y_test)\n",
+    "\n",
+    "# Plot confusion matrix\n",
+    "ConfusionMatrixDisplay.from_predictions(y_test, y_preds)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  },
+  "nav_menu": {},
+  "toc": {
+   "navigate_menu": true,
+   "number_sections": true,
+   "sideBar": true,
+   "threshold": 6,
+   "toc_cell": false,
+   "toc_section_display": "block",
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+``` python
+%%writefile ./tf_distr_slurm.sh
+#!/bin/bash
+# Slurm batch script: starts tf_distr.py once per node (2 nodes, 2 GPUs each)
+# for multi-worker TensorFlow training.
+#SBATCH --job-name=tf_distr_example
+#SBATCH --account=p70824              # Training account (active); change it when running under another project
+#SBATCH --nodes=2                    # Number of nodes
+#SBATCH --ntasks-per-node=1          # Number of tasks per node
+#SBATCH --cpus-per-task=256          # Number of CPU cores per task (including hyperthreading if needed)
+#SBATCH --partition=zen3_0512_a100x2
+#SBATCH --qos=zen3_0512_a100x2       # QOS for training
+#SBATCH --gres=gpu:2                 # Number of GPUs per node
+#SBATCH --output=./output/%x-%j.out  # Output file (%x = job name, %j = job ID); ./output must exist before submitting
+#SBATCH --time=00:10:00
+######################
+### Set Environment ###
+######################
+module load miniconda3
+eval "$(conda shell.bash hook)"
+source /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-huggingface-v2/modules  # Activate the conda environment
+#source /opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-llm-training-v3
+######################
+#### Set Network #####
+######################
+# Resolve the hostnames of the allocated nodes (one per array element).
+# tf_distr.py reads NODE_0 and NODE_1 to build TF_CONFIG.
+mapfile -t nodes_array < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
+export NODE_0="${nodes_array[0]}"
+export NODE_1="${nodes_array[1]}"
+# Must match the port that tf_distr.py uses for the worker addresses.
+export MASTER_PORT=29500
+# Total number of GPUs in the job; only logged for reference.
+NUM_PROCESSES=$(( SLURM_NNODES * SLURM_GPUS_ON_NODE ))
+echo "Nodes: ${nodes_array[*]} | GPUs in total: ${NUM_PROCESSES}"
+######################
+### Launch Training ###
+######################
+# srun starts one python process per task, i.e. one per node.
+srun python3 tf_distr.py
+```
+%% Output
+    Overwriting ./tf_distr_slurm.sh
+%% Cell type:code id: tags:
+``` python
+%%writefile ./tf_distr.py
+import os
+import json
+import numpy as np
+import tensorflow as tf
+from sklearn.datasets import fetch_openml
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import ConfusionMatrixDisplay
+import matplotlib.pyplot as plt
+# Dynamically set TF_CONFIG for distributed TensorFlow
+def set_tf_config():
+    # Retrieve master and worker nodes from environment variables
+    node_0 = os.environ['NODE_0']
+    node_1 = os.environ['NODE_1']
+    task_id = int(os.environ['SLURM_PROCID'])  # SLURM task ID determines worker index
+    # Create a list of workers
+    worker_hosts = [f"{node_0}:29500", f"{node_1}:29500"]
+    # Construct TF_CONFIG
+    tf_config = {
+        "cluster": {"worker": worker_hosts},
+        "task": {"type": "worker", "index": task_id}
+    }
+    os.environ['TF_CONFIG'] = json.dumps(tf_config)
+    print("TF_CONFIG set to:", json.dumps(tf_config, indent=4))
+# Call the TF_CONFIG setup function
+set_tf_config()
+# Set up distributed strategy
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+# Load and preprocess the MNIST dataset
+mnist = fetch_openml('mnist_784', as_frame=False)
+X, y = mnist.data, mnist.target
+# Train-test split
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
+X_train = X_train.reshape(-1, 28, 28) / 255.0
+X_test = X_test.reshape(-1, 28, 28) / 255.0
+y_train = np.array(y_train, dtype="int32")
+y_test = np.array(y_test, dtype="int32")
+# Create a distributed dataset
+batch_size = 64
+train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(batch_size)
+test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)
+# Define the model within the strategy's scope
+with strategy.scope():
+    model = tf.keras.models.Sequential([
+        tf.keras.layers.Flatten(input_shape=(28, 28)),
+        tf.keras.layers.Dense(128, activation='relu'),
+        tf.keras.layers.Dropout(0.2),
+        tf.keras.layers.Dense(10, activation="softmax")
+    ])
+    model.compile(
+        optimizer=tf.keras.optimizers.Adam(),
+        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
+        metrics=['accuracy']
+    )
+# Train the model
+model.fit(train_dataset, epochs=20, validation_data=test_dataset)
+# Evaluate the model
+y_pred = model.predict(X_test, batch_size=batch_size)
+y_preds = tf.argmax(y_pred, axis=1)
+# Print predictions and ground truth
+print("Predictions:", y_preds.numpy())
+print("Ground Truth:", y_test)
+# Plot confusion matrix
+ConfusionMatrixDisplay.from_predictions(y_test, y_preds)
+plt.show()
+```
+%% Output
+    Overwriting ./tf_distr.py
+%% Cell type:code id: tags:
+``` python
+```