Phi3 masked_fill_ in-place support
🚀 Model / language coverage
Trying to run the HF model "microsoft/Phi-3-mini-4k-instruct" hits an issue when translating an in-place op (masked_fill_ applied to the output of Tensor.to).
Traceback (most recent call last):
File "/home/tfogal/scratch/tfx-tests/phi3/phi3.py", line 70, in <module>
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/tfogal/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-mini-4k-instruct/0a67737cc96d2554230f90338b163bc6380a2a85/modeling_phi3.py", line 1243, in forward
outputs = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/tfogal/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-mini-4k-instruct/0a67737cc96d2554230f90338b163bc6380a2a85/modeling_phi3.py", line 1091, in forward
attention_mask = _prepare_4d_causal_attention_mask(
File "/home/tfogal/env/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py", line 295, in _prepare_4d_causal_attention_mask
def _prepare_4d_causal_attention_mask(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/fx/graph_module.py", line 784, in call_wrapped
return self._wrapped_call(self, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/fx/graph_module.py", line 361, in __call__
raise e
File "/usr/local/lib/python3.10/dist-packages/torch/fx/graph_module.py", line 348, in __call__
return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "<eval_with_key>.6", line 5, in forward
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/tfogal/scratch/thunder/thunder/core/module.py", line 80, in forward
res = self._forward_fn(*args, **kwargs)
File "/home/tfogal/scratch/thunder/thunder/__init__.py", line 724, in fn_
cache_entry, inps, pro_to_epi = get_computation_and_inputs(*args, **kwargs)
File "/home/tfogal/scratch/thunder/thunder/core/langctxs.py", line 136, in _fn
result = fn(*args, **kwargs)
File "/home/tfogal/scratch/thunder/thunder/__init__.py", line 219, in cache_info_wrapper
res = fn(*args, **kwargs)
File "/home/tfogal/scratch/thunder/thunder/__init__.py", line 527, in get_computation_and_inputs
orig_to_view_swap_map = check_inplace_to_views(computation_trc)
File "/home/tfogal/scratch/thunder/thunder/core/functionalization.py", line 62, in check_inplace_to_views
check(
File "/home/tfogal/scratch/thunder/thunder/core/baseutils.py", line 107, in check
raise exception_type(s())
NotImplementedError: in-place op of `torch.Tensor.masked_fill_` to `torch.Tensor.to` output `<TensorProxy(name="mask_1", dtype=thunder.dtypes.bfloat16, shape=(1, 1))>` is not supported. It's unclear if the output of ('torch.flatten', 'torch.reshape', 'Tensor.reshape_as', 'torch.Tensor.to', 'torch.Tensor.contiguous') is a copy, a view, or the input itself, as per https://pytorch.org/docs/stable/tensor_view.html
The error comes from https://github.com/Lightning-AI/lightning-thunder/blob/fceb64efc93a80a27d38b8e84f0e2b5f132f3d2f/thunder/core/functionalization.py#L62-L71
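For context on why the check fires: torch.Tensor.to may return the input tensor itself (when dtype and device already match) or a fresh copy, so the functionalization pass cannot tell whether the in-place masked_fill_ also mutates the original tensor. A minimal plain-PyTorch illustration of that ambiguity (not Thunder-specific; the tensor names are made up):

import torch

x = torch.zeros(2, 2, dtype=torch.bfloat16)
same = x.to(torch.bfloat16)        # dtype/device already match -> returns `x` itself
print(same is x)                   # True
same.masked_fill_(torch.ones(2, 2, dtype=torch.bool), 1.0)
print(x)                           # `x` was mutated through the alias

y = torch.zeros(2, 2, dtype=torch.bfloat16)
copied = y.to(torch.float32)       # dtype changes -> returns a new tensor
copied.masked_fill_(torch.ones(2, 2, dtype=torch.bool), 1.0)
print(y)                           # `y` is unchanged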
Pitch
This issue blocks a model targeted for integration.
Minimal Repro
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from datasets import load_dataset
import thunder
import thunder.dynamo

model_name = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype='auto'
)
model = torch.compile(model, backend=thunder.dynamo.ThunderCompiler())
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize the model's embeddings to accommodate new tokens

# Load a smaller dataset
dataset = load_dataset("tiny_shakespeare", split='train')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,
                     max_length=2)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Convert the dataset to PyTorch format and specify columns to return as tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Create PyTorch DataLoader
dataloader = DataLoader(tokenized_dataset, batch_size=1, shuffle=False)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_epochs * len(dataloader),
)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Fine-tuning loop
model.train()
batch = next(dataloader.__iter__())

# Move input tensors to device
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)

# Forward pass
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
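For triage, here is a smaller, HF-free sketch that should hit the same check, assuming thunder.jit goes through the same functionalization path as the ThunderCompiler backend above; the function and its shapes are made up to mirror what the attention-mask preparation does:

import torch
import thunder

# Hypothetical reduced repro: apply masked_fill_ to the output of Tensor.to.
def f(mask: torch.Tensor) -> torch.Tensor:
    m = mask.to(torch.bfloat16)               # a copy, or the input itself? unclear to the functionalizer
    m.masked_fill_(mask == 0, float("-inf"))  # in-place write into that output
    return m

jf = thunder.jit(f)
jf(torch.ones(1, 1))  # expected to raise the NotImplementedError above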
What does the trace look like when this happens? We identified this as unclear behaviour, but I'm wondering whether the .to comes from the user code or from a decomposition.
Technically this isn't a lack of support for masked_fill_ itself; the check fails because it's unclear whether the output of Tensor.to aliases its input (sketched below).
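For illustration only, a hedged sketch of how the pattern could be made unambiguous by giving masked_fill_ a target that is definitely a fresh tensor (.clone() here); this is a hypothetical workaround, not a change made to the model or to Thunder:

import torch
import thunder

# Hypothetical workaround sketch: clone() is unambiguously a copy, so the
# in-place masked_fill_ no longer targets the output of Tensor.to directly.
def g(mask: torch.Tensor) -> torch.Tensor:
    m = mask.to(torch.bfloat16).clone()
    m.masked_fill_(mask == 0, float("-inf"))
    return m

jg = thunder.jit(g)
print(jg(torch.ones(1, 2)))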
I believe this was fixed by #1292 (the network now runs, at least). Thanks, Masaki!