
Commit b81f3500 authored by Pfister, Martin

Remove prepare_model_for_kbit_training(...) and get_peft_model(...), because SFTTrainer already takes care of this.
parent ba0522bf
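The same change is applied to three training scripts below: a Llama 3.1 70B run and two Mistral 7B runs, one quantized with bitsandbytes and one with GPTQ. Instead of calling prepare_model_for_kbit_training() and get_peft_model() by hand, each script now relies on SFTTrainer performing both steps itself when it receives a quantized model together with a peft_config. A minimal sketch of that pattern, assuming a recent TRL version; the checkpoint name and toy dataset are placeholders, not taken from these scripts, and the quantized loading needs a CUDA GPU with bitsandbytes installed:

import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-v0.1',  # placeholder checkpoint
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.bfloat16),
)
peft_config = LoraConfig(task_type='CAUSAL_LM', target_modules='all-linear')
trainer = SFTTrainer(
    model=model,
    args=SFTConfig(output_dir='sketch-run'),
    train_dataset=Dataset.from_dict({'text': ['Question: ... Answer: ...']}),
    peft_config=peft_config,  # SFTTrainer applies prepare_model_for_kbit_training()
                              # and get_peft_model() internally for quantized models.
)

After construction, trainer.model is already the PEFT-wrapped model, which is why the scripts now print the trainable-parameter count from trainer.model (guarded with hasattr) instead of from the bare model.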
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTConfig, SFTTrainer
 # import wandb
@@ -41,13 +41,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -58,11 +51,6 @@ peft_config = LoraConfig(
     target_modules='all-linear',
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'llama3.1-70b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -77,7 +65,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -108,6 +99,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
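The gradient_checkpointing_kwargs={'use_reentrant': False} setting is forwarded to PyTorch's torch.utils.checkpoint machinery and selects the non-reentrant implementation mentioned in the comment. A standalone illustration of what that flag controls (not part of the scripts):

import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)
# Activations inside `layer` are discarded during the forward pass and
# recomputed in backward, trading extra compute for lower peak GPU memory --
# the enabled/disabled trade-off quoted in the diff comments above.
y = checkpoint(layer, x, use_reentrant=False)
y.sum().backward()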
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTConfig, SFTTrainer
 # import wandb
@@ -43,13 +43,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -60,11 +53,6 @@ peft_config = LoraConfig(
     target_modules='all-linear',
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'mistral7b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -79,7 +67,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -110,6 +101,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 from trl import SFTConfig, SFTTrainer
 from auto_gptq.nn_modules import qlinear
@@ -60,13 +60,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -78,11 +71,6 @@ peft_config = LoraConfig(
     target_modules=find_linear_modules(model),
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'mistral7b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -97,7 +85,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -128,6 +119,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
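The GPTQ variant passes target_modules=find_linear_modules(model) instead of 'all-linear', presumably because GPTQ quantization replaces nn.Linear layers with auto_gptq QuantLinear modules (hence the qlinear import) that the 'all-linear' shortcut may not match. The helper itself is outside this diff, so the following is only a hypothetical reconstruction of what such a function might look like; it matches quantized linear layers by class name rather than by a specific auto_gptq class:

import torch

def find_linear_modules(model):
    # Hypothetical reconstruction -- the real helper is not shown in this diff.
    # Collect the leaf names of all linear-like layers (regular nn.Linear and
    # auto_gptq QuantLinear replacements) for LoraConfig(target_modules=...).
    names = set()
    for full_name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) or type(module).__name__.endswith('QuantLinear'):
            leaf = full_name.split('.')[-1]
            if leaf != 'lm_head':  # keep the output head out of the LoRA targets
                names.add(leaf)
    return sorted(names)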