Add Gradio example for VSC with GPU attached to jupyterlab

e636022a · Pfister, Martin · 9619f9e5 · e636022a
Commit e636022a authored 6 months ago by Pfister, Martin
--- a/D3_Gradio_example_VSC.ipynb
+++ b/D3_Gradio_example_VSC.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fc68bcd6-bca8-4376-b231-0c8c84c532e4",
+   "metadata": {},
+   "source": [
+    "## Gradio"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c96197c4-9bdf-4895-a209-f97a30660b66",
+   "metadata": {},
+   "source": [
+    "[Gradio](https://www.gradio.app) lets you build simple web interfaces for your software. In this example, we use Gradio to create a simple chat interface for a large language model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "18b34bbb-3946-4e31-8c51-c50f66d327bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries\n",
+    "import torch\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig\n",
+    "import gradio as gr\n",
+    "import os\n",
+    "import random"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "744de243-d842-4fd8-a2d9-e0ebfb85f91e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use a random TCP port:\n",
+    "port = random.randint(10000, 50000)\n",
+    "# Get username\n",
+    "username = os.environ['USER']\n",
+    "# Construct URL:\n",
+    "relative_url = f'/user/{username}/proxy/absolute/{port}/'  # Needs to start with '/'\n",
+    "absolute_url = f'https://jupyterhub.vsc.ac.at{relative_url}'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "114aa1d0-c7c2-49f5-878b-de7f76a6eb9e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.\n",
+      "Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7e7fe196c066404d8a596a2eb4703854",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Device set to use cpu\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load tokenizer and model and create a pipeline that can be used for inference:\n",
+    "model_name = '/gpfs/data/fs70824/LLMs_models_datasets/models/microsoft--phi-3.5-mini-instruct'\n",
+    "# model_name = 'microsoft/Phi-3.5-mini-instruct'\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_name,\n",
+    "    device_map='cuda',\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    trust_remote_code=True,\n",
+    "    quantization_config=BitsAndBytesConfig(\n",
+    "        load_in_4bit=True,\n",
+    "        bnb_4bit_quant_type=\"nf4\",\n",
+    "        bnb_4bit_compute_dtype=torch.bfloat16\n",
+    "    )\n",
+    ")\n",
+    "pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "96fefd3b-f679-4b3a-b0de-f519c24249bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare a function that takes chatbot questions and returns the answer from the LLM:\n",
+    "def get_answer(question, history=[]):\n",
+    "    history.append(\n",
+    "        {'role': 'user', 'content': question}\n",
+    "    )\n",
+    "    result = pipe(history, max_new_tokens=500, return_full_text=False)\n",
+    "    return result[0]['generated_text'].strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "53ae1640-5515-45d1-a8ff-494d72662c62",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "which: no node in (/opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-llm-training-v4/bin:/opt/sw/conda/miniconda3-24.1.2/condabin:/opt/sw/cuda-zen/spack-0.19.0/bin:/home/fs71550/mpfister/.local/bin:/home/fs71550/mpfister/bin:/usr/share/Modules/bin:/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/lpp/mmfs/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/sbin:/opt/sw/vsc_modules/modules-4.2.2/bin:/opt/sw/vsc4/VSC/x86_64/generic/bin:/opt/sw/tools:/opt/sw/conda/miniconda3/condabin:/opt/sw/conda/miniconda3/bin)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Running on local URL:  http://127.0.0.1:39953\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n",
+      "\n",
+      "Open the following URL in your webbrowser:\n",
+      "https://jupyterhub.vsc.ac.at/user/mpfister/proxy/absolute/39953/\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create a Gradio ChatInterface and launch it:\n",
+    "chat_interface = gr.ChatInterface(get_answer, type='messages')\n",
+    "chat_interface.launch(share=False, inline=False, server_name='127.0.0.1', server_port=port, root_path=f'/user/{username}/proxy/absolute/{port}')\n",
+    "print(f'\\nOpen the following URL in your webbrowser:\\n{absolute_url}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b049395-b496-4412-abb7-1c5a7f592cce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:fc68bcd6-bca8-4376-b231-0c8c84c532e4 tags:
+
+## Gradio
+
+%% Cell type:markdown id:c96197c4-9bdf-4895-a209-f97a30660b66 tags:
+
+[Gradio](https://www.gradio.app) lets you build simple web interfaces for your software. In this example, we use Gradio to create a simple chat interface for a large language model.
+
+%% Cell type:code id:18b34bbb-3946-4e31-8c51-c50f66d327bd tags:
+
+``` python
+# Import necessary libraries
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
+import gradio as gr
+import os
+import random
+```
+
+%% Cell type:code id:744de243-d842-4fd8-a2d9-e0ebfb85f91e tags:
+
+``` python
+# Use a random TCP port:
+port = random.randint(10000, 50000)
+# Get username
+username = os.environ['USER']
+# Construct URL:
+relative_url = f'/user/{username}/proxy/absolute/{port}/'  # Needs to start with '/'
+absolute_url = f'https://jupyterhub.vsc.ac.at{relative_url}'
+```
+
+%% Cell type:code id:114aa1d0-c7c2-49f5-878b-de7f76a6eb9e tags:
+
+``` python
+# Load tokenizer and model and create a pipeline that can be used for inference:
+model_name = '/gpfs/data/fs70824/LLMs_models_datasets/models/microsoft--phi-3.5-mini-instruct'
+# model_name = 'microsoft/Phi-3.5-mini-instruct'
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map='cuda',
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+    quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16
+    )
+)
+pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)
+```
+
+%% Output
+
+    `flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
+    Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
+
+
+    Device set to use cpu
+
+%% Cell type:code id:96fefd3b-f679-4b3a-b0de-f519c24249bd tags:
+
+``` python
+# Prepare a function that takes chatbot questions and returns the answer from the LLM:
+def get_answer(question, history=[]):
+    history.append(
+        {'role': 'user', 'content': question}
+    )
+    result = pipe(history, max_new_tokens=500, return_full_text=False)
+    return result[0]['generated_text'].strip()
+```
+
+%% Cell type:code id:53ae1640-5515-45d1-a8ff-494d72662c62 tags:
+
+``` python
+# Create a Gradio ChatInterface and launch it:
+chat_interface = gr.ChatInterface(get_answer, type='messages')
+chat_interface.launch(share=False, inline=False, server_name='127.0.0.1', server_port=port, root_path=f'/user/{username}/proxy/absolute/{port}')
+print(f'\nOpen the following URL in your webbrowser:\n{absolute_url}')
+```
+
+%% Output
+
+    which: no node in (/opt/sw/jupyterhub/envs/conda/vsc5/jupyterhub-llm-training-v4/bin:/opt/sw/conda/miniconda3-24.1.2/condabin:/opt/sw/cuda-zen/spack-0.19.0/bin:/home/fs71550/mpfister/.local/bin:/home/fs71550/mpfister/bin:/usr/share/Modules/bin:/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/lpp/mmfs/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/bin:/opt/sw/slurm/x86_64/alma8.8/22-05-2-1/sbin:/opt/sw/vsc_modules/modules-4.2.2/bin:/opt/sw/vsc4/VSC/x86_64/generic/bin:/opt/sw/tools:/opt/sw/conda/miniconda3/condabin:/opt/sw/conda/miniconda3/bin)
+
+    * Running on local URL:  http://127.0.0.1:39953
+    
+    To create a public link, set `share=True` in `launch()`.
+    
+    Open the following URL in your webbrowser:
+    https://jupyterhub.vsc.ac.at/user/mpfister/proxy/absolute/39953/
+
+%% Cell type:code id:2b049395-b496-4412-abb7-1c5a7f592cce tags:
+
+``` python
+```