[BUG] Error pre-training BERT
Hi guys,
I am following the Megatron-LM example to pre-train a BERT model but I'm getting this error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/root/Megatron-LM/pretrain_bert.py", line 193, in <module>
[rank0]: pretrain(train_valid_test_datasets_provider, model_provider,
[rank0]: File "/root/Megatron-LM/megatron/training/training.py", line 274, in pretrain
[rank0]: iteration, num_floating_point_operations_so_far = train(
[rank0]: ^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/training/training.py", line 1027, in train
[rank0]: train_step(forward_step_func,
[rank0]: File "/root/Megatron-LM/megatron/training/training.py", line 550, in train_step
[rank0]: losses_reduced = forward_backward_func(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 381, in forward_backward_no_pipelining
[rank0]: output_tensor, num_tokens = forward_step(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 206, in forward_step
[rank0]: output_tensor, loss_func = forward_step_func(data_iterator, model)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/pretrain_bert.py", line 139, in forward_step
[rank0]: output_tensor = model(tokens, padding_mask,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/distributed/distributed_data_parallel.py", line 180, in forward
[rank0]: return self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/legacy/model/module.py", line 190, in forward
[rank0]: outputs = self.module(*inputs, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/models/bert/bert_model.py", line 237, in forward
[rank0]: hidden_states = self.encoder(
[rank0]: ^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 383, in forward
[rank0]: hidden_states, context = layer(
[rank0]: ^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 178, in forward
[rank0]: attention_output_with_bias = self.self_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/transformer/attention.py", line 315, in forward
[rank0]: core_attn_out = self.core_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/Megatron-LM/megatron/core/transformer/custom_layers/transformer_engine.py", line 514, in forward
[rank0]: core_attn_out = super().forward(
[rank0]: ^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/transformer_engine/pytorch/attention.py", line 5267, in forward
[rank0]: return self.fused_attention(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 600, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/transformer_engine/pytorch/attention.py", line 4157, in forward
[rank0]: cu_seqlens_q = get_cu_seqlens(attention_mask)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.12/site-packages/transformer_engine/pytorch/attention.py", line 247, in get_cu_seqlens
[rank0]: cu_seqlens = torch.cat((zero, cu_seqlens))
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: RuntimeError: Tensors must have same number of dimensions: got 1 and 2
I'm using transformer_engine==1.8.0+3ec998e with Megatron-LM core_v0.7.0. My pre-training script looks like this:
#!/bin/bash
# Launch script for BERT pre-training via pretrain_bert.py, started with torchrun.
# It builds three argument strings (model/optimizer, data, logging/checkpointing)
# and passes them, plus the checkpoint save/load directory, to the trainer.

# Environment variables exported to the training process.
# NOTE(review): presumably required by Megatron-LM for some parallel modes; confirm against its docs.
export CUDA_DEVICE_MAX_CONNECTIONS=1
# NOTE(review): presumably turns off Transformer Engine's FlashAttention backend;
# the reported traceback goes through TE's fused_attention path. Confirm against TE docs.
export NVTE_FLASH_ATTN=0
# Checkpoint directory, vocabulary file, and dataset path handed to the trainer below.
CHECKPOINT_PATH="./checkpoints"
VOCAB_FILE="./vocabs/bert-base-cased-vocab.txt"
DATA_PATH="my-bert_text_sentence"
# Model, batch-size, and optimizer flags.
# Inside double quotes a backslash-newline is removed; the bare newlines after
# --fp16 and --use-mcore-models stay in the string. Both forms end up as separate
# words because $BERT_ARGS is expanded unquoted in the torchrun command.
BERT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr 0.0001 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--lr-decay-style linear \
--min-lr 0.00001 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
--use-mcore-models
"
# Dataset location, vocabulary, and the train/validation/test split specification.
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--split 949,50,1
"
# Logging, checkpoint-save, and evaluation cadence (in iterations).
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
# Run the trainer. The *_ARGS variables are intentionally left unquoted so the
# shell word-splits them into individual command-line arguments. Checkpoints are
# saved to and loaded from the same directory.
torchrun pretrain_bert.py \
$BERT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH
I'm also interested in using the BERT cased checkpoint instead of pre-training from scratch.
Thanks in advance.
same issue here
Marking as stale. No activity in 60 days.
Hi guys,
Just wondering whether this issue has been resolved. I still hit the same error on my side with Megatron-LM core_v0.12.1.
This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.