refact
Handle OOM better on smaller/older GPUs, or bigger models on regular GPUs
downloads-refact-1 | -- 4522 -- FILTER explanation: initial loss too big calculated on a single file, threshold is 3.500. Likely
downloads-refact-1 | -- 4522 -- means the file doesn't contain code.
downloads-refact-1 | -- 4522 -- Reading /perm_storage/cfg/sources_filetypes.cfg
downloads-refact-1 | -- 4522 -- 20230930 23:34:53 FTUNE STATUS working
downloads-refact-1 | overwrite /perm_storage/cfg/finetune_status.out with prog=prog_filter status=working
downloads-refact-1 | -- 4522 --
downloads-refact-1 | -- 4522 -- 90cd71def5c2 Caught exception:
downloads-refact-1 | -- 4522 -- Traceback (most recent call last):
downloads-refact-1 | -- 4522 -- File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
downloads-refact-1 | -- 4522 -- return _run_code(code, main_globals, None,
downloads-refact-1 | -- 4522 -- File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
downloads-refact-1 | -- 4522 -- exec(code, run_globals)
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_enterprise/finetune/finetune_filter.py", line 14, in <module>
downloads-refact-1 | -- 4522 -- main(models_mini_db)
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_data_pipeline/finetune/finetune_filter.py", line 288, in main
downloads-refact-1 | -- 4522 -- raise e
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_data_pipeline/finetune/finetune_filter.py", line 273, in main
downloads-refact-1 | -- 4522 -- pre_filtering(stats_dict, models_db)
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_data_pipeline/finetune/finetune_filter.py", line 209, in pre_filtering
downloads-refact-1 | -- 4522 -- filtered = loss_based_filter(
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
downloads-refact-1 | -- 4522 -- return func(*args, **kwargs)
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_data_pipeline/finetune/finetune_filter.py", line 108, in loss_based_filter
downloads-refact-1 | -- 4522 -- logits = forward(input=batch['input'])
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/refact_data_pipeline/finetune/model_handling.py", line 163, in model_forward
downloads-refact-1 | -- 4522 -- logits = model.forward(
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 548, in forward
downloads-refact-1 | -- 4522 -- transformer_outputs = self.transformer(
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
downloads-refact-1 | -- 4522 -- return forward_call(*args, **kwargs)
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 459, in forward
downloads-refact-1 | -- 4522 -- outputs = block(
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
downloads-refact-1 | -- 4522 -- return forward_call(*args, **kwargs)
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 278, in forward
downloads-refact-1 | -- 4522 -- attn_outputs = self.attn(
downloads-refact-1 | -- 4522 -- File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
downloads-refact-1 | -- 4522 -- return forward_call(*args, **kwargs)
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 214, in forward
downloads-refact-1 | -- 4522 -- attn_output, attn_weights = self._attn(query, key.transpose(-1, -2), value, attention_mask, alibi)
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 177, in _attn
downloads-refact-1 | -- 4522 -- attn_weights = upcast_masked_softmax(attn_weights, attention_mask, mask_value, softmax_dtype)
downloads-refact-1 | -- 4522 -- RuntimeError: The following operation failed in the TorchScript interpreter.
downloads-refact-1 | -- 4522 -- Traceback of TorchScript (most recent call last):
downloads-refact-1 | -- 4522 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 28, in upcast_masked_softmax
downloads-refact-1 | -- 4522 -- input_dtype = x.dtype
downloads-refact-1 | -- 4522 -- x = x.to(softmax_dtype)
downloads-refact-1 | -- 4522 -- x = torch.where(mask, x, mask_value)
downloads-refact-1 | -- 4522 -- ~~~~~~~~~~~ <--- HERE
downloads-refact-1 | -- 4522 -- x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
downloads-refact-1 | -- 4522 -- return x
downloads-refact-1 | -- 4522 -- RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB (GPU 0; 10.75 GiB total capacity; 10.04 GiB already allocated; 298.69 MiB free; 10.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
downloads-refact-1 | -- 4522 --
downloads-refact-1 | -- 4522 --
downloads-refact-1 | -- 30 -- 20230930 23:34:54 WEBUI 172.31.2.1:33580 - "GET /tab-finetune-get HTTP/1.1" 200
downloads-refact-1 | -- 30 -- 20230930 23:34:54 WEBUI 172.31.2.1:34256 - "GET /tab-finetune-config-and-runs HTTP/1.1" 200
downloads-refact-1 | 20230930 23:34:55 4522 finished python -m refact_enterprise.finetune.finetune_sequence --filter-only @:gpu00, retcode 1
downloads-refact-1 | overwrite /perm_storage/cfg/finetune_status.out with prog=prog_filter status=failed
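Side note: the allocator hint in the error message itself can be applied like this. It only mitigates fragmentation and won't create missing VRAM, and the value below is just an example, not a recommendation from this thread:

    import os

    # Must be set before the first CUDA allocation (in practice, before
    # importing torch in the finetune process). 128 MiB is an arbitrary
    # example value for the split size.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

    import torch  # noqa: E402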
Workaround: change the tokens parameter from 4096 to 2048 for Refact/1.6B.
We could do a monkey patch to the finetune config using GPU runtime information, or simply add another config for Refact 1.6B with a lower context size for those "older" GPUs without flash-attention support.
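A minimal sketch of what that runtime check could look like (the function name and thresholds are made up for illustration, not the actual finetune config API; flash-attention availability is approximated here by compute capability):

    import torch

    def pick_context_size(default_n_ctx: int = 4096, reduced_n_ctx: int = 2048) -> int:
        # Hypothetical helper: drop to a smaller finetune context on GPUs
        # without flash-attention support (approximated as compute
        # capability < 8.0) or with little VRAM.
        major, _minor = torch.cuda.get_device_capability(0)
        vram_gib = torch.cuda.get_device_properties(0).total_memory / 2**30
        if major < 8 or vram_gib < 12:
            return reduced_n_ctx
        return default_n_ctx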
@olegklimov
I was having the same error. I set the token limit to 2048, but now I get another CUDA error when running the Filter step:
refact | -- 461 -- /root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py:177: UserWarning: FALLBACK path has been taken inside: runCudaFusionGroup. This is an indication that codegen Failed for some reason.
refact | -- 461 -- To debug try disable codegen fallback path via setting the env variable `export PYTORCH_NVFUSER_DISABLE=fallback`
refact | -- 461 -- (Triggered internally at ../third_party/nvfuser/csrc/manager.cpp:335.)
refact | -- 461 -- attn_weights = upcast_masked_softmax(attn_weights, attention_mask, mask_value, softmax_dtype)
refact | -- 461 -- 20231010 12:32:23 FTUNE FAILED: The following operation failed in the TorchScript interpreter.
refact | -- 461 -- Traceback of TorchScript (most recent call last):
refact | -- 461 -- RuntimeError: The following operation failed in the TorchScript interpreter.
refact | -- 461 -- Traceback of TorchScript (most recent call last):
refact | -- 461 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 27, in fallback_cuda_fuser
refact | -- 461 -- ):
refact | -- 461 -- input_dtype = x.dtype
refact | -- 461 -- x = x.to(softmax_dtype)
refact | -- 461 -- ~~~~ <--- HERE
refact | -- 461 -- x = torch.where(mask, x, mask_value)
refact | -- 461 -- x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
refact | -- 461 -- RuntimeError: CUDA error: the launch timed out and was terminated
refact | -- 461 -- CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
refact | -- 461 -- For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
refact | -- 461 -- Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
refact | -- 461 --
refact | -- 461 --
refact | -- 461 --
refact | -- 461 -- FAILED: The following operation failed in the TorchScript interpreter.
refact | -- 461 -- Traceback of TorchScript (most recent call last):
refact | -- 461 -- RuntimeError: The following operation failed in the TorchScript interpreter.
refact | -- 461 -- Traceback of TorchScript (most recent call last):
refact | -- 461 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 27, in fallback_cuda_fuser
refact | -- 461 -- ):
refact | -- 461 -- input_dtype = x.dtype
refact | -- 461 -- x = x.to(softmax_dtype)
refact | -- 461 -- ~~~~ <--- HERE
refact | -- 461 -- x = torch.where(mask, x, mask_value)
refact | -- 461 -- x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
refact | -- 461 -- RuntimeError: CUDA error: the launch timed out and was terminated
refact | -- 461 -- CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
refact | -- 461 -- For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
refact | -- 461 -- Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
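For what it's worth, the two debug switches the trace itself suggests can be enabled the same way, before torch is imported, to get a synchronous and therefore more accurate stack trace. They are only diagnostics and won't fix the timeout:

    import os

    # Both switches are read by PyTorch at startup, so set them before importing torch.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"            # synchronous launches -> accurate traceback
    os.environ["PYTORCH_NVFUSER_DISABLE"] = "fallback"  # surface the original nvFuser codegen error

    import torch  # noqa: E402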
How much VRAM is actually needed for fine-tuning the 1.6B model?
It doesn't say "out of memory" for you. 🤔 Not sure how to debug this. @bonswouar what GPU do you have?
It doesn't say "out of memory" for you. 🤔 Not sure how to debug this.
I've just tried on Linux to see if the output is any different (I noticed the model seems much faster to load, by the way); this time I always get the following (with 2048 tokens as well):
refact | -- 155 -- 20231016 10:23:51 FTUNE FAILED: The following operation failed in the TorchScript interpreter.
refact | -- 155 -- Traceback of TorchScript (most recent call last):
refact | -- 155 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 102, in get_alibi_biases
refact | -- 155 --
refact | -- 155 -- # Multiply them pair-wise to get the AliBi bias matrix
refact | -- 155 -- biases = distance[:, :, None] * m[None, None, :]
refact | -- 155 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
refact | -- 155 -- biases = biases.permute(2, 0, 1)[None, :, :T, :T]
refact | -- 155 -- return biases.contiguous()
refact | -- 155 -- RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 3.94 GiB total capacity; 3.00 GiB already allocated; 53.38 MiB free; 3.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
refact | -- 155 --
refact | -- 155 -- FAILED: The following operation failed in the TorchScript interpreter.
refact | -- 155 -- Traceback of TorchScript (most recent call last):
refact | -- 155 -- File "/root/.cache/huggingface/modules/transformers_modules/smallcloudai/Refact-1_6B-fim/acc9591f69aae4d950d58d372aa6c8b34543fd2c/modeling_gpt_refact.py", line 102, in get_alibi_biases
refact | -- 155 --
refact | -- 155 -- # Multiply them pair-wise to get the AliBi bias matrix
refact | -- 155 -- biases = distance[:, :, None] * m[None, None, :]
refact | -- 155 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
refact | -- 155 -- biases = biases.permute(2, 0, 1)[None, :, :T, :T]
refact | -- 155 -- return biases.contiguous()
refact | -- 155 -- RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 3.94 GiB total capacity; 3.00 GiB already allocated; 53.38 MiB free; 3.16 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
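For what it's worth, both reported allocation sizes are consistent with a dense [T, T, n_heads] float32 attention/ALiBi buffer, assuming the model uses 32 heads (an assumption about the model config, not something stated in this thread):

    # Rough check of the reported allocations (assumes 32 heads and fp32 buffers).
    n_heads, bytes_per_el = 32, 4
    for T in (4096, 2048):
        gib = T * T * n_heads * bytes_per_el / 2**30
        print(f"T={T}: {gib:.2f} GiB")  # 4096 -> 2.00 GiB, 2048 -> 0.50 GiB

So even at 2048 tokens that single buffer wants 512 MiB on top of roughly 3 GiB of fp16 weights, which lines up with the "3.00 GiB already allocated" on a 3.94 GiB card.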
what GPU do you have?
Only a GTX 970. I was hoping this would be enough since I can run 7B quantized models, but I guess I was a bit optimistic :)
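That expectation is understandable, but the arithmetic is different: quantized inference only has to hold the weights plus a small KV cache, while fine-tuning also keeps activations, gradients and optimizer state resident. A rough, parameters-only comparison (everything else ignored):

    # Parameter memory only; training additionally needs activations, gradients
    # and optimizer state, which is why inference fits but fine-tuning does not.
    gib = lambda n_bytes: n_bytes / 2**30
    print(f"7B   @ 4-bit: {gib(7.0e9 * 0.5):.1f} GiB")  # ~3.3 GiB, inference weights only
    print(f"1.6B @ fp16 : {gib(1.6e9 * 2.0):.1f} GiB")  # ~3.0 GiB before any training state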