Can't Train Phi3 on Kaggle (ValueError: Pointer argument (at 2) cannot be accessed from Triton (cpu tensor?))
I'm trying to train Phi-3 on Kaggle. I already trained it for 300 steps on Google Colab with no issues and saved the model to wandb. I then downloaded the model from wandb to continue training on Kaggle, but I keep getting this error whenever I try to train it. Can anyone help me out? The error seems to come from Triton. I've installed the dependencies as instructed in the notebook.
%%capture
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
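As a sanity check (this snippet is added for illustration, it is not part of the notebook), you can confirm that the Kaggle session actually exposes the GPUs to PyTorch, since the error below complains about a CPU tensor being handed to a Triton kernel:
import torch
print(torch.cuda.is_available())      # expect True on a GPU session
print(torch.cuda.device_count())      # expect 2 on a T4 x2 session
print(torch.cuda.get_device_name(0))  # e.g. "Tesla T4"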
ValueError Traceback (most recent call last)
Cell In[12], line 1
----> 1 trainer_stats = trainer.train(resume_from_checkpoint = True)
File /opt/conda/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
359 self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
363 # After training we make sure to retrieve back the original forward pass method
364 # for the embedding layer by removing the forward post hook.
365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1780, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1778 hf_hub_utils.enable_progress_bars()
1779 else:
-> 1780 return inner_training_loop(
1781 args=args,
1782 resume_from_checkpoint=resume_from_checkpoint,
1783 trial=trial,
1784 ignore_keys_for_eval=ignore_keys_for_eval,
1785 )
File <string>:355, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3036, in Trainer.training_step(self, model, inputs)
3033 return loss_mb.reduce_mean().detach().to(self.args.device)
3035 with self.compute_loss_context_manager():
-> 3036 loss = self.compute_loss(model, inputs)
3038 if self.args.n_gpu > 1:
3039 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3059, in Trainer.compute_loss(self, model, inputs, return_outputs)
3057 else:
3058 labels = None
-> 3059 outputs = model(**inputs)
3060 # Save past state if it exists
3061 # TODO: this needs to be fixed and made cleaner later.
3062 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs)
183 return self.module(*inputs[0], **module_kwargs[0])
184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
186 return self.gather(outputs, self.output_device)
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
--> 200 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
106 output = results[i]
107 if isinstance(output, ExceptionWrapper):
--> 108 output.reraise()
109 outputs.append(output)
110 return outputs
File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:722, in ExceptionWrapper.reraise(self)
718 except TypeError:
719 # If the exception takes multiple arguments, don't try to
720 # instantiate since we don't know how to
721 raise RuntimeError(msg) from None
--> 722 raise exception
ValueError: Caught ValueError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
output = module(*input, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 882, in PeftModelForCausalLM_fast_forward
return self.base_model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
return self.model.forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/mistral.py", line 213, in MistralForCausalLM_fast_forward
outputs = self.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 650, in LlamaModel_fast_forward
hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 115, in decorate_fwd
return fwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/_utils.py", line 369, in forward
(output,) = forward_function(hidden_states, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 432, in LlamaDecoderLayer_fast_forward
hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
File "/opt/conda/lib/python3.10/site-packages/unsloth/kernels/rms_layernorm.py", line 190, in fast_rms_layernorm
out = Fast_RMS_Layernorm.apply(X, W, eps, gemma)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/unsloth/kernels/rms_layernorm.py", line 144, in forward
fx[(n_rows,)](
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 550, in run
bin.c_wrapper(
ValueError: Pointer argument (at 2) cannot be accessed from Triton (cpu tensor?)
Did you turn on the GPU?
Yes, I had GPU T4 x2 for the session.
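For what it's worth, the exception is raised from torch.nn.parallel ("Caught ValueError in replica 1 on device 1"), so DataParallel splitting the model across both T4s looks like the trigger. A minimal sketch of one possible workaround, assuming training on a single GPU is acceptable (CUDA_VISIBLE_DEVICES must be set before the first torch import):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # hide the second T4 so DataParallel never kicks in

import torch
from unsloth import FastLanguageModel

print(torch.cuda.device_count())  # should now report 1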
@ShazzadAliShozol Try this:
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
import os
import torch
from unsloth import FastLanguageModel
os.environ["WANDB_DISABLED"] = "true"
repo_id = "YOUR_REPO_ID"
hub_token = "YOUR_HUB_TOKEN"
model_id = "unsloth/Phi-3-mini-4k-instruct"
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)
model.config.use_cache = False
model.print_trainable_parameters()
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts }
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True)
from trl import SFTTrainer
from transformers import TrainingArguments
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    dataset_text_field = "text",
    packing = False,
    args = TrainingArguments(
        output_dir = repo_id,
        do_eval = False,
        save_total_limit = 1,
        num_train_epochs = 1,
        logging_strategy = "steps",
        logging_steps = 3,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,  # warmup_steps expects an integer; use warmup_ratio for a fractional warmup
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 0,
        hub_private_repo = True,
        hub_token = hub_token,
    ),
)
trainer_stats = trainer.train()
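If you want to resume from the 300-step checkpoint you pulled from wandb instead of starting fresh, the same call also accepts a checkpoint path (the directory below is a placeholder, point it at wherever you unpacked the artifact):
trainer_stats = trainer.train(resume_from_checkpoint = "/kaggle/working/checkpoint-300")  # placeholder path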
Yeah, it's working now. Thank you very much. Since the problem is fixed, I'll close the issue.