
Commit b81f3500 authored by Pfister, Martin

Remove prepare_model_for_kbit_training(...) and get_peft_model(...), because SFTTrainer already takes care of this.
parent ba0522bf
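The same change is applied to three training scripts below: a Llama 3.1 70B run and two Mistral 7B runs, one quantized with bitsandbytes and one with GPTQ. Instead of calling prepare_model_for_kbit_training() and get_peft_model() by hand, each script now relies on SFTTrainer performing both steps itself when it receives a quantized model together with a peft_config. A minimal sketch of that pattern, assuming a recent TRL version; the checkpoint name and toy dataset are placeholders, not taken from these scripts, and the quantized loading needs a CUDA GPU with bitsandbytes installed:

import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-v0.1',  # placeholder checkpoint
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.bfloat16),
)
peft_config = LoraConfig(task_type='CAUSAL_LM', target_modules='all-linear')
trainer = SFTTrainer(
    model=model,
    args=SFTConfig(output_dir='sketch-run'),
    train_dataset=Dataset.from_dict({'text': ['Question: ... Answer: ...']}),
    peft_config=peft_config,  # SFTTrainer applies prepare_model_for_kbit_training()
                              # and get_peft_model() internally for quantized models.
)

After construction, trainer.model is already the PEFT-wrapped model, which is why the scripts now print the trainable-parameter count from trainer.model (guarded with hasattr) instead of from the bare model.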
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTConfig, SFTTrainer
 # import wandb
@@ -41,13 +41,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -58,11 +51,6 @@ peft_config = LoraConfig(
     target_modules='all-linear',
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'llama3.1-70b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -77,7 +65,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -108,6 +99,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
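The gradient_checkpointing_kwargs={'use_reentrant': False} setting is forwarded to PyTorch's torch.utils.checkpoint machinery and selects the non-reentrant implementation mentioned in the comment. A standalone illustration of what that flag controls (not part of the scripts):

import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)
# Activations inside `layer` are discarded during the forward pass and
# recomputed in backward, trading extra compute for lower peak GPU memory --
# the enabled/disabled trade-off quoted in the diff comments above.
y = checkpoint(layer, x, use_reentrant=False)
y.sum().backward()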
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from trl import SFTConfig, SFTTrainer
 # import wandb
@@ -43,13 +43,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -60,11 +53,6 @@ peft_config = LoraConfig(
     target_modules='all-linear',
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'mistral7b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -79,7 +67,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -110,6 +101,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
@@ -3,7 +3,7 @@
 import torch
 from accelerate import PartialState
 from datasets import load_dataset
-from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
+from peft import LoraConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 from trl import SFTConfig, SFTTrainer
 from auto_gptq.nn_modules import qlinear
@@ -60,13 +60,6 @@ model = AutoModelForCausalLM.from_pretrained(
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.use_cache = False
 model.config.pretraining_tp = 1  # disable tensor parallelism
-# Gradient checkpointing improves memory efficiency, but slows down training,
-# e.g. Mistral 7B with PEFT using bitsandbytes:
-# - enabled: 11 GB GPU RAM and 12 samples/second
-# - disabled: 40 GB GPU RAM and 8 samples/second
-gradient_checkpointing = True
-model = prepare_model_for_kbit_training(model,
-                                        use_gradient_checkpointing=gradient_checkpointing)
 peft_config = LoraConfig(
     task_type='CAUSAL_LM',
@@ -78,11 +71,6 @@ peft_config = LoraConfig(
     target_modules=find_linear_modules(model),
 )
-model = get_peft_model(model, peft_config)
-if process_index == 0:  # Only print in first process.
-    model.print_trainable_parameters()
 project_name = 'mistral7b-medmcqa'
 run_name = '1'
 # notes = ''
@@ -97,7 +85,10 @@ training_arguments = SFTConfig(
     output_dir=f'{project_name}-{run_name}',
     per_device_train_batch_size=8,
     gradient_accumulation_steps=1,
-    gradient_checkpointing=gradient_checkpointing,
+    gradient_checkpointing=True,  # Gradient checkpointing improves memory efficiency, but slows down training,
+                                  # e.g. Mistral 7B with PEFT using bitsandbytes:
+                                  # - enabled: 11 GB GPU RAM and 12 samples/second
+                                  # - disabled: 40 GB GPU RAM and 8 samples/second
     gradient_checkpointing_kwargs={'use_reentrant': False},  # Use newer implementation that will become the default.
     ddp_find_unused_parameters=False,  # Set to False when using gradient checkpointing to suppress warning message.
     log_level_replica='error',  # Disable warnings in all but the first process.
@@ -128,6 +119,10 @@ trainer = SFTTrainer(
     packing=False,
 )
+if process_index == 0:  # Only print in first process.
+    if hasattr(trainer.model, "print_trainable_parameters"):
+        trainer.model.print_trainable_parameters()
 result = trainer.train()
 # Print statistics in first process only:
...
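The GPTQ variant passes target_modules=find_linear_modules(model) instead of 'all-linear', presumably because GPTQ quantization replaces nn.Linear layers with auto_gptq QuantLinear modules (hence the qlinear import) that the 'all-linear' shortcut may not match. The helper itself is outside this diff, so the following is only a hypothetical reconstruction of what such a function might look like; it matches quantized linear layers by class name rather than by a specific auto_gptq class:

import torch

def find_linear_modules(model):
    # Hypothetical reconstruction -- the real helper is not shown in this diff.
    # Collect the leaf names of all linear-like layers (regular nn.Linear and
    # auto_gptq QuantLinear replacements) for LoraConfig(target_modules=...).
    names = set()
    for full_name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear) or type(module).__name__.endswith('QuantLinear'):
            leaf = full_name.split('.')[-1]
            if leaf != 'lm_head':  # keep the output head out of the LoRA targets
                names.add(leaf)
    return sorted(names)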