TransformerEngine
TransformerEngine copied to clipboard
Unit test failed due to "CUDA Error: operation not supported"
I ran this command:
pytest TransformerEngine/tests/pytorch/test_float8tensor.py -k test_fp8_meta
and got this error:
cls = <class 'transformer_engine.pytorch.fp8.FP8GlobalStateManager'>, forward = True, fp8_weights = False
@classmethod
def reduce_and_update_fp8_tensors(
cls,
forward: bool = True,
fp8_weights: bool = False,
) -> None:
"""Concatenate, reduce, and split amaxes in the global buffer."""
for buffer_key, amax_buffer in cls.global_amax_buffer.items():
# Check for forward or backward reduction.
fwd_update, fp8_weights_update, autocast_key = cls.split_key_in_buffer(buffer_key)
if fwd_update != forward:
continue
# Only skip a forward update when `fp8_weights` is explicitly set to `True`
# (inside optimizer) and the current key is not an `fp8_weight_update` key.
# For other cases, we need to reduce because of activation tensors.
# TODO(ksivaman) consider separate weight and activation fp8_tensors.
if fwd_update and fp8_weights and not fp8_weights_update:
continue
if len(amax_buffer) == 0:
continue
# Retrieve autocast specific args and concat amaxes.
recipe, group = cls.autocast_arguments[autocast_key]
contiguous_amax = torch.cat(amax_buffer)
# Reduction.
if (
recipe.reduce_amax
and torch.distributed.is_initialized()
and torch.distributed.get_world_size(group=group) > 1
):
cls.reduce_tensor_across_group_op_max(contiguous_amax, group)
# Amax and scale update.
unfused_update = (
bool(int(os.getenv("NVTE_UNFUSED_FP8_UPDATE", "0")))
or callable(recipe.amax_compute_algo)
or callable(recipe.scaling_factor_compute_algo)
)
if not unfused_update:
> tex.fused_amax_and_scale_update_after_reduction(
contiguous_amax,
cls.global_amax_history_buffer[buffer_key],
cls.global_scale_buffer[buffer_key],
cls.global_scale_inv_buffer[buffer_key],
recipe.amax_compute_algo,
get_fp8_te_dtype(recipe, forward),
recipe.margin,
)
E RuntimeError: /data/users/ybliang/TransformerEngine/transformer_engine/common/recipe/delayed_scaling.cu:430 in function amax_and_scale_update_after_reduction: CUDA Error: operation not supported
/home/ybliang/.conda/envs/pt/lib/python3.10/site-packages/transformer_engine/pytorch/fp8.py:356: RuntimeError
========================================================================================================= short test summary info =========================================================================================================
FAILED TransformerEngine/tests/pytorch/test_float8tensor.py::TestFloat8Tensor::test_fp8_meta - RuntimeError: /data/users/ybliang/TransformerEngine/transformer_engine/common/recipe/delayed_scaling.cu:430 in function amax_and_scale_update_after_reduction: CUDA Error: operation not supported
Interesting, I haven't seen this before when running on L40 or H100. What are you running on? Do the other tests in test_float8tensor.py pass on your system?
@timmoon10 I'm running on H100. The other tests in test_float8tensor.py all passed; however, there are many failures from other test files, e.g., test_recipe.py (see the failure output here).
This is fixed by upgrading the CUDA driver version from 525 to 535.