Can't Train Phi3 on Kaggle (ValueError: Pointer argument (at 2) cannot be accessed from Triton (cpu tensor?))
I'm trying to train Phi-3 on Kaggle. I already trained it for 300 steps on Google Colab with no issues and saved the model to wandb. I then downloaded the model from wandb to continue training on Kaggle, but I keep getting this error whenever I try to train it. Can anyone help me out? The error seems to come from Triton. I've installed the dependencies as instructed in the notebook.
%%capture
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
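As a sanity check (this snippet is added for illustration, it is not part of the notebook), you can confirm that the Kaggle session actually exposes the GPUs to PyTorch, since the error below complains about a CPU tensor being handed to a Triton kernel:
import torch
print(torch.cuda.is_available())      # expect True on a GPU session
print(torch.cuda.device_count())      # expect 2 on a T4 x2 session
print(torch.cuda.get_device_name(0))  # e.g. "Tesla T4"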
ValueError Traceback (most recent call last)
Cell In[12], line 1
----> 1 trainer_stats = trainer.train(resume_from_checkpoint = True)
File /opt/conda/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
359 self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
363 # After training we make sure to retrieve back the original forward pass method
364 # for the embedding layer by removing the forward post hook.
365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1780, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1778 hf_hub_utils.enable_progress_bars()
1779 else:
-> 1780 return inner_training_loop(
1781 args=args,
1782 resume_from_checkpoint=resume_from_checkpoint,
1783 trial=trial,
1784 ignore_keys_for_eval=ignore_keys_for_eval,
1785 )
File <string>:355, in _fast_inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3036, in Trainer.training_step(self, model, inputs)
3033 return loss_mb.reduce_mean().detach().to(self.args.device)
3035 with self.compute_loss_context_manager():
-> 3036 loss = self.compute_loss(model, inputs)
3038 if self.args.n_gpu > 1:
3039 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3059, in Trainer.compute_loss(self, model, inputs, return_outputs)
3057 else:
3058 labels = None
-> 3059 outputs = model(**inputs)
3060 # Save past state if it exists
3061 # TODO: this needs to be fixed and made cleaner later.
3062 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs)
183 return self.module(*inputs[0], **module_kwargs[0])
184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
186 return self.gather(outputs, self.output_device)
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
--> 200 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
106 output = results[i]
107 if isinstance(output, ExceptionWrapper):
--> 108 output.reraise()
109 outputs.append(output)
110 return outputs
File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:722, in ExceptionWrapper.reraise(self)
718 except TypeError:
719 # If the exception takes multiple arguments, don't try to
720 # instantiate since we don't know how to
721 raise RuntimeError(msg) from None
--> 722 raise exception
ValueError: Caught ValueError in replica 1 on device 1.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
output = module(*input, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 882, in PeftModelForCausalLM_fast_forward
return self.base_model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 161, in forward
return self.model.forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/mistral.py", line 213, in MistralForCausalLM_fast_forward
outputs = self.model(
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 650, in LlamaModel_fast_forward
hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/amp/autocast_mode.py", line 115, in decorate_fwd
return fwd(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/_utils.py", line 369, in forward
(output,) = forward_function(hidden_states, *args)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py", line 432, in LlamaDecoderLayer_fast_forward
hidden_states = fast_rms_layernorm(self.input_layernorm, hidden_states)
File "/opt/conda/lib/python3.10/site-packages/unsloth/kernels/rms_layernorm.py", line 190, in fast_rms_layernorm
out = Fast_RMS_Layernorm.apply(X, W, eps, gemma)
File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/opt/conda/lib/python3.10/site-packages/unsloth/kernels/rms_layernorm.py", line 144, in forward
fx[(n_rows,)](
File "/opt/conda/lib/python3.10/site-packages/triton/runtime/jit.py", line 550, in run
bin.c_wrapper(
ValueError: Pointer argument (at 2) cannot be accessed from Triton (cpu tensor?)
Did you turn on the GPU?
Yes, I had GPU T4 x2 for the session.
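For what it's worth, the exception is raised from torch.nn.parallel ("Caught ValueError in replica 1 on device 1"), so DataParallel splitting the model across both T4s looks like the trigger. A minimal sketch of one possible workaround, assuming training on a single GPU is acceptable (CUDA_VISIBLE_DEVICES must be set before the first torch import):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # hide the second T4 so DataParallel never kicks in

import torch
from unsloth import FastLanguageModel

print(torch.cuda.device_count())  # should now report 1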
@ShazzadAliShozol Try this:
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"
# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
import os
import torch
from unsloth import FastLanguageModel
os.environ["WANDB_DISABLED"] = "true"
repo_id = "YOUR_REPO_ID"
hub_token = "YOUR_HUB_TOKEN"
model_id = "unsloth/Phi-3-mini-4k-instruct"
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
tokenizer.pad_token = tokenizer.unk_token # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)
model.config.use_cache = False
model.print_trainable_parameters()
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts }
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True)
from trl import SFTTrainer
from transformers import TrainingArguments
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    dataset_text_field = "text",
    packing = False,
    args = TrainingArguments(
        output_dir = repo_id,
        do_eval = False,
        save_total_limit = 1,
        num_train_epochs = 1,
        logging_strategy = "steps",
        logging_steps = 3,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,  # warmup_steps expects an integer; use warmup_ratio for a fractional warmup
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 0,
        hub_private_repo = True,
        hub_token = hub_token,
    ),
)
trainer_stats = trainer.train()
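If you want to resume from the 300-step checkpoint you pulled from wandb instead of starting fresh, the same call also accepts a checkpoint path (the directory below is a placeholder, point it at wherever you unpacked the artifact):
trainer_stats = trainer.train(resume_from_checkpoint = "/kaggle/working/checkpoint-300")  # placeholder path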
Yeah, it's working now. Thank you very much. Since the problem is fixed, I'll close the issue.