windows tests report
System Info
OS: Windows 10
Python: 3.10
Torch: 2.1.2
GPU: 4060 Ti 16GB
CUDA: 11.8
bitsandbytes: latest snapshot
Reproduction
This is just a report on the current state of Windows support.
Expected behavior
These are the test results for each test module:
tests\test_functional.py: 31 failed, 592 passed, 9 skipped in 767.86s (0:12:47)
tests\test_autograd.py: 2240 passed, 704 warnings in 119.18s (0:01:59)
tests\test_linear4bit.py: 32 passed in 2.90s
tests\test_linear8bitlt.py: 18 passed in 14.60s
tests\test_optim.py: system crash after test done. (about 19 errors, collected 177 items)
Details
===================================================================================== FAILURES ======================================================================================
_________________________________________ test_nvidia_transform[dims=2-transpose=F-orderOut=col32-orderA=row-int8-dim3=3-dim2=224-dim1=152] _________________________________________
dim1 = 152, dim2 = 224, dim3 = 3, dims = 2, dtype = torch.int8, orderA = 'row', orderOut = 'col32', transpose = False
@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype)
@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut"))
@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose"))
@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims"))
def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
if dims == 3 and orderOut != "col32":
return
if dtype == torch.int32 and orderOut != "col32":
return
try:
func = F.get_transform_func(dtype, orderA, orderOut, transpose)
except ValueError as ve:
pytest.skip(str(ve)) # skip if not supported
if dims == 2:
A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype)
elif dims == 3:
A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(
dtype
)
out, S = F.nvidia_transform(A, to_order=orderOut)
if orderOut == "row":
torch.testing.assert_close(A.flatten(), out.flatten())
elif orderOut == "col":
torch.testing.assert_close(A.t().flatten(), out.flatten())
elif orderOut == "col32":
if dims == 2:
n = A.shape[0] * (A.shape[1] + (32 - (A.shape[1] % 32)))
elif dims == 3:
n = (
A.shape[0]
* A.shape[1]
* (A.shape[2] + (32 - (A.shape[2] % 32)))
)
> assert out.numel() == n
E AssertionError: assert 34048 == 38912
E + where 34048 = <built-in method numel of Tensor object at 0x0000000000A48F40>()
E + where <built-in method numel of Tensor object at 0x0000000000A48F40> = tensor([[ -32, -70, 98, 119, -80, 66, -11, 30, -63, -51, 47, -22, 100, -78, 32, 39, -71, 17,\n -20, 94, ..., 64, 115, 73, 109, 95, -71, 100, 40, 113, 75, -54, 58, -86, 20, -37,\n -120, -79, 88, -12, -68],\n [ 68, 75, -66, 23, 7, -45, 86, -8, -50, -24, 48, 55, 18, -97, -61, 64, -43, 70,\n -67, -4, ..., -116, 67, 27, -97, 36, 124, -122, -93, 22, -3, -13, 7, -38, 70, 104,\n 120, -64, 90, 83, 49],\n [ -5, 97, 31, 123, -101, -69, -36, -13, 15, 96, -11, -12, 107, -57, 48, -122, -64, -94,\n 43, 106, ..., -5, 41, 124, 41, 27, 104, 84, 89, 21, 118, -64, -112, 83, -12, -87,\n 19, -13, -47, 11, -17],\n [ 123, -126, 29, -88, 45, 1, 125, -124, 111, -40, -60, -83, -49, -44, -18, -111, 84, 63,\n 59, -76, ..., -60, 63, -120, -48, -52, -78, -99, 17, 59, -17, 75, -37, -60, -37, 52,\n 68, -107, -39, -54, 88],\n [ -69, -64, -86, -60, 88, -84, 44, 30, 6, 35, 12, 21, 104, 39, 122, 10...32, 74, 84, 102, -61,\n 105, -39, 18, -53, 39],\n [ -91, -124, 108, 77, -19, -14, 115, 75, -90, 123, -68, -117, -91, -71, -59, -59, -121, 97,\n 97, -34, ..., 62, 47, -78, 40, 97, 41, 83, 84, 25, 121, -52, 11, 104, -78, 92,\n -23, 10, 81, -34, 39],\n [-128, -36, -72, -57, -41, 36, -69, 76, 103, -87, -39, -84, 87, -91, 103, 30, 114, 122,\n -19, 51, ..., 95, 65, 83, 71, -11, 111, 4, 74, 50, -82, 89, -79, -67, 104, -71,\n 71, -69, 1, -84, 72],\n [ 79, 29, 74, 96, 19, -30, 2, -73, 78, 75, -58, -24, -25, 63, 28, 63, -14, -128,\n -15, -68, ..., 113, -114, -48, 75, 107, 3, 30, 61, -50, -103, -25, 61, -4, -18, -27,\n -26, -47, -2, -19, -7],\n [ -19, -34, 119, 39, 12, 1, -31, 88, -9, -18, -115, -13, -62, 44, 117, -7, 10, 53,\n -64, -97, ..., -104, -82, 111, -44, 40, -100, -114, 30, -72, 45, 117, 16, 29, -122, 2,\n 22, -102, -46, 9, 48]], device='cuda:0', dtype=torch.int8).numel
tests\test_functional.py:587: AssertionError
_________________________________________ test_nvidia_transform[dims=2-transpose=F-orderOut=col32-orderA=row-int8-dim3=3-dim2=224-dim1=73] __________________________________________
dim1 = 73, dim2 = 224, dim3 = 3, dims = 2, dtype = torch.int8, orderA = 'row', orderOut = 'col32', transpose = False
@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1"))
@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2"))
@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3"))
@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype)
@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA"))
@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut"))
@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose"))
@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims"))
def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose):
if dims == 3 and orderOut != "col32":
return
if dtype == torch.int32 and orderOut != "col32":
return
try:
func = F.get_transform_func(dtype, orderA, orderOut, transpose)
except ValueError as ve:
pytest.skip(str(ve)) # skip if not supported
if dims == 2:
A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype)
elif dims == 3:
A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(
dtype
)
out, S = F.nvidia_transform(A, to_order=orderOut)
if orderOut == "row":
torch.testing.assert_close(A.flatten(), out.flatten())
elif orderOut == "col":
torch.testing.assert_close(A.t().flatten(), out.flatten())
elif orderOut == "col32":
if dims == 2:
n = A.shape[0] * (A.shape[1] + (32 - (A.shape[1] % 32)))
elif dims == 3:
n = (
A.shape[0]
* A.shape[1]
* (A.shape[2] + (32 - (A.shape[2] % 32)))
)
> assert out.numel() == n
E AssertionError: assert 16352 == 18688
E + where 16352 = <built-in method numel of Tensor object at 0x000000001E9A7A60>()
E + where <built-in method numel of Tensor object at 0x000000001E9A7A60> = tensor([[-109, -60, 80, -56, -43, 70, 41, 58, -45, 120, 39, -127, 34, -111, -84, 16, 126, -54,\n -36, 107, ..., 48, 110, -95, -62, 75, 99, -69, -12, -57, -97, 75, -47, 117, 67, -65,\n 39, 8, 108, 76, -49],\n [ 57, -67, -127, -64, 18, 46, 117, 107, 71, -44, 35, -82, 115, -93, -29, -58, 67, -33,\n -87, -57, ..., 45, 30, -58, -46, -11, 13, 96, -100, 124, 122, 12, 107, 27, -23, -126,\n -65, 29, -92, 106, 43],\n [ 124, 46, -65, -74, -92, 12, 71, 51, 75, -19, 70, -41, -49, -108, -28, 37, -117, 66,\n -42, -128, ..., 70, -73, -71, 122, -110, -22, 68, -114, -105, 116, -34, -28, -40, 21, 9,\n 8, -94, 75, -12, -33],\n [ 60, -18, -127, -57, -36, 38, 98, -102, 12, 18, 84, -128, -77, -34, -36, 109, 1, 19,\n -104, -114, ..., 89, 123, 20, -84, -89, 104, 15, 34, 47, -95, -46, 116, -32, -109, -43,\n -15, 21, -79, 57, -24],\n [-124, -59, 78, -41, -37, 75, 61, 77, -123, 59, 88, -53, -44, 4, -75, 4...29, 80, -12, 67, 111,\n 34, 75, -122, -103, -8],\n [-114, 66, 102, -111, 46, 98, -32, 53, -122, 47, -117, 31, 10, -110, 55, -119, -16, 4,\n -35, -82, ..., 115, 97, 9, -67, -9, -41, -105, 103, -18, -20, 44, 8, 8, -64, -25,\n 22, -7, -11, -40, -46],\n [ 111, -64, 46, -106, 65, -123, -32, -61, -40, 111, -99, -70, 20, 54, -125, -89, -44, 123,\n 79, 125, ..., -74, -106, 29, 68, -19, -15, 28, -103, 10, -56, 119, -93, 111, -94, 17,\n -66, 37, 23, 79, 126],\n [-124, -21, -8, 75, -79, -90, -56, -102, -118, 112, 123, -28, -71, 88, 3, 11, -36, -48,\n 108, 28, ..., 50, -32, 5, 73, -92, -55, 60, -128, 63, 113, -18, -40, 15, 35, 5,\n -71, 1, -62, 115, -20],\n [ -74, -81, 120, -51, 48, -124, 89, -96, 2, 80, -45, -94, -36, -90, -25, 69, 93, 116,\n 100, -111, ..., 61, 81, -4, 39, -31, -125, -122, 14, 70, 36, 2, -31, -113, 78, 65,\n -31, -25, 110, 41, 39]], device='cuda:0', dtype=torch.int8).numel
tests\test_functional.py:587: AssertionError
(snip)...
____________________________________________________________________ test_gemv_4bit[uint8-bf16-fc2-nf4-DQ_True] _____________________________________________________________________
dtype = torch.bfloat16, storage_type = 'nf4', quant_storage = torch.uint8, double_quant = True, kind = 'fc2'
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'])
@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind):
for dim in [128, 256, 512, 1024]:
#for dim in [4*1024]:
#for dim in [1*16]:
errs1 = []
errs2 = []
errs3 = []
relerrs1 = []
relerrs2 = []
relerrs3 = []
max_errs1 = []
max_errs2 = []
max_errs3 = []
for i in range(100):
if kind == 'fc1':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim*4, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'fc2':
A = torch.randn(1, 4*dim, dtype=dtype, device='cuda')
B = torch.randn(dim, 4*dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'attn':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
elif kind == 'attn_packed':
A = torch.randn(1, dim, dtype=dtype, device='cuda')
B = torch.randn(dim*3, dim, dtype=dtype, device='cuda')/math.sqrt(dim)
qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage)
C3 = torch.matmul(A, B.t())
C2 = F.gemv_4bit(A, qB.t(), state=state)
A.requires_grad = True
C1 = bnb.matmul_4bit(A, qB.t(), state)
err1 = (C1-C2).abs().float()
err2 = (C3-C2).abs().float()
err3 = (C3-C1).abs().float()
mag1 = torch.abs(C1).float()+1e-5
mag2 = torch.abs(C3).float()+1e-5
mag3 = torch.abs(C3).float()+1e-5
relerr1 = err1/mag1
relerr2 = err2/mag2
relerr3 = err3/mag3
max_err1 = err1.max()
max_err2 = err2.max()
max_err3 = err3.max()
errs1.append(err1.mean().item())
errs2.append(err2.mean().item())
errs3.append(err3.mean().item())
relerrs1.append(relerr1.mean().item())
relerrs2.append(relerr2.mean().item())
relerrs3.append(relerr3.mean().item())
max_errs1.append(max_err1.item())
max_errs2.append(max_err2.item())
max_errs3.append(max_err3.item())
c = int(C1.numel()*0.0014*(dim/256))+1
c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False)
err1 = sum(errs1)/len(errs1)/math.sqrt(dim)
err2 = sum(errs2)/len(errs2)/math.sqrt(dim)
err3 = sum(errs3)/len(errs3)/math.sqrt(dim)
relerr1 = sum(relerrs1)/len(relerrs1)/math.sqrt(dim)
relerr2 = sum(relerrs2)/len(relerrs2)/math.sqrt(dim)
relerr3 = sum(relerrs3)/len(relerrs3)/math.sqrt(dim)
maxerr1 = sum(max_errs1)/len(max_errs1)/math.sqrt(dim)
maxerr2 = sum(max_errs2)/len(max_errs2)/math.sqrt(dim)
maxerr3 = sum(max_errs3)/len(max_errs3)/math.sqrt(dim)
absratio = err2/err3
relratio = relerr2/relerr3
maxratio = relerr2/relerr3
# for debugging if the tests fails
#
#print('='*80)
#print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:')
#print(C1.flatten()[-20:])
#print(C2.flatten()[-20:])
#print(f'inference vs training abs: {err1}')
#print(f'inference vs training rel: {relerr1}')
#print(f'inference vs training max: {maxerr1}')
#print(f'inference vs training vs torch err ratio abs: {absratio}')
#print(f'inference vs training vs torch err ratio rel: {relratio}')
#print(f'inference vs training vs torch err ratio max: {maxratio}')
if dtype == torch.float16:
if dim <= 512:
assert err1 < 7e-5
assert relerr1 < 0.0008
else:
assert err1 < 6e-5
assert relerr1 < 2e-4
assert absratio < 1.005 and absratio > 0.995
assert relratio < 1.005 and relratio > 0.995
assert maxratio < 1.005 and maxratio > 0.995
elif dtype == torch.float32:
if dim <= 512:
assert err1 < 5e-8
assert relerr1 < 1e-6
assert maxerr1 < 1e-7
else:
assert err1 < 5e-8
assert relerr1 < 8e-6
assert maxerr1 < 1e-7
assert absratio < 1.005 and absratio > 0.995
assert relratio < 1.005 and relratio > 0.995
assert maxratio < 1.005 and maxratio > 0.995
elif dtype == torch.bfloat16:
if dim <= 512:
assert err1 < 6e-4
assert relerr1 < 0.007
assert maxerr1 < 0.015
else:
assert err1 < 2e-4
> assert relerr1 < 0.002
E assert 0.005869260463805403 < 0.002
tests\test_functional.py:2280: AssertionError
(snip)...
====================================================================================== PASSES =======================================================================================
_____________________________________________________________________________ test_dynamic_quantization _____________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.01197060595266521
0.018862400725483893
0.011960445903241634
0.018855047821998597
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=4k, hidden=12k] __________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.054001569747924805
0.045006752014160156
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=5k, hidden=15k] __________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.07099795341491699
0.07000350952148438
_________________________________________________________ test_bench_8bit_training[batch=2, seq=512, model=12k, hidden=48k] _________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.5150082111358643
0.5169980525970459
___________________________________________________________________________________ test_overflow ___________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
col_ampere
col_ampere
__________________________________________________________________________________ test_spmm_bench __________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
6.008148193359375e-05
0.002994537353515625 0.02800440788269043
0.1069309290901506
6.961822509765625e-05
0.014997482299804688 0.027995824813842773
0.5357042487417286
___________________________________________________________________________________ test_matmuls ____________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.1888427734375 0.1895751953125
0.189697265625 0.189453125
________________________________________________________ test_spmm_coo_very_sparse[out_func=zeros-fp16-dim2=12288-dim1=2048] ________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
________________________________________________________ test_spmm_coo_very_sparse[out_func=ones-fp16-dim2=12288-dim1=2048] _________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
______________________________________________________________________ test_spmm_coo_dequant[dtype0-2048-2048] ______________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
tensor(15., device='cuda:0')
cusparse fp16 0.04900002479553223
int8 0.17600059509277344
int8+dequant 0.20199847221374512
matmul 0.15299677848815918
sparse+ matmul 0.5020043849945068
partial matmul 0.3229959011077881
partial matmul 0.18599939346313477
tensor(15., device='cuda:0')
cusparse fp16 0.04900026321411133
int8 0.14599609375
int8+dequant 0.18900036811828613
matmul 0.15299654006958008
sparse+ matmul 0.5290052890777588
partial matmul 0.3379969596862793
partial matmul 0.1929950714111328
_____________________________________________________________ test_bench_matmul[batch=1, seq=1, model=6656, hidden=26k] _____________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
pytorch fp16: [1,1,6656], [6656,26624]->[1,1,26624]: 1.3130s
bnb nf4: [1,1,6656], [6656,26624]->[1,1,26624]: 0.3770s
bnb nf4+DQ: [1,1,6656], [6656,26624]->[1,1,26624]: 0.4420s
pytorch fp16: [1,1,6656], [6656,26624]->[1,1,26624]: 1.3120s
bnb nf4: [1,1,6656], [6656,26624]->[1,1,26624]: 0.3770s
bnb nf4+DQ: [1,1,6656], [6656,26624]->[1,1,26624]: 0.4410s
__________________________________________________________________________________ test_zeropoint ___________________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
5.278311252593994 0.002829474862664938 1.2293792224227218e-06 6.859274890302913e-07 8.60375803313218e-05 0.000324648164678365
5.207507133483887 0.0028353077359497547 1.2703461607088684e-06 6.862092050141655e-07 8.6198553617578e-05 0.0003371547209098935
_____________________________________________________________________________ test_blockwise_cpu_large ______________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
0.4819974899291992
0.3310065269470215
0.07500696182250977
0.09300756454467773
0.219010591506958
0.2360093593597412
0.07599949836730957
0.1819922924041748
_____________________________________________________________________________ test_bench_dequantization _____________________________________________________________________________
------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------
tensor(255, device='cuda:0', dtype=torch.uint8)
tensor(255, device='cuda:0', dtype=torch.uint8)
=============================================================== 31 failed, 592 passed, 9 skipped in 767.86s (0:12:47) ===============================================================
test_nvidia_transform: 8 failed, 88 passed, 536 deselected in 11.29s
test_gemv_4bit: 23 failed, 169 passed, 440 deselected in 615.68s (0:10:15)
The test_nvidia_transform[dims=2-transpose=F-orderOut=col32-orderA=row-int8-dim3=3-dim2=224-dim1=152] tests may well be known failures (that should be skips); see how the test has early exits at the start (that were touched recently in #1000).
As for
tests\test_optim.py: system crash after test done.
this is probably the same issue that manifested as dmesg errors for me on WSL. You might want to try running with -k "not (benchmark or slow)" to skip some of the heavy tests and see whether more of the suite passes :)
Thanks a lot for the report @wkpark !
Out of curiosity would you mind also running the transformers integration tests? 🙏
First git clone: https://github.com/huggingface/transformers.git
Then run: RUN_SLOW=1 pytest tests/quantization/bnb/test_4bit.py
set RUN_SLOW=1
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_4bit.py
====================================================================================================== test session starts ======================================================================================================
platform win32 -- Python 3.10.11, pytest-7.4.2, pluggy-1.3.0
rootdir: D:\src\transformers
configfile: pyproject.toml
plugins: anyio-3.7.1, hydra-core-1.3.2, hypothesis-6.93.0, xdist-3.5.0
collected 39 items
tests\quantization\bnb\test_4bit.py ......F.FF....s...FF.FF..FFFFFFFFFFFFFF [100%]
Details
=========================================================================================================== FAILURES ============================================================================================================
________________________________________________________________________________________________ Bnb4BitTest.test_original_dtype ________________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitTest testMethod=test_original_dtype>
def test_original_dtype(self):
r"""
A simple test to check if the model succesfully stores the original dtype
"""
> self.assertTrue(hasattr(self.model_4bit.config, "_pre_quantization_dtype"))
E AssertionError: False is not true
tests\quantization\bnb\test_4bit.py:177: AssertionError
_________________________________________________________________________________________ Bnb4BitTest.test_quantization_num_parameters __________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitTest testMethod=test_quantization_num_parameters>
def test_quantization_num_parameters(self):
r"""
Test if the number of returned parameters is correct
See: https://github.com/huggingface/transformers/issues/25978
"""
num_params_4bit = self.model_4bit.num_parameters()
num_params_fp16 = self.model_fp16.num_parameters()
> self.assertEqual(num_params_4bit, num_params_fp16)
E AssertionError: 1118429184 != 1722408960
tests\quantization\bnb\test_4bit.py:144: AssertionError
__________________________________________________________________________________________________ Bnb4BitTest.test_rwkv_4bit ___________________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitTest testMethod=test_rwkv_4bit>
def test_rwkv_4bit(self):
r"""
A simple test to check if 4-bit RWKV inference works as expected.
"""
model_id = "RWKV/rwkv-4-169m-pile"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
tok = AutoTokenizer.from_pretrained(model_id)
text = "Hello my name is"
input_ids = tok.encode(text, return_tensors="pt").to(0)
> _ = model.generate(input_ids, max_new_tokens=30)
tests\quantization\bnb\test_4bit.py:211:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\_contextlib.py:115: in decorate_context
return func(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:1522: in generate
return self.greedy_search(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:2339: in greedy_search
outputs = self(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:165: in new_forward
output = old_forward(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:789: in forward
rwkv_outputs = self.rwkv(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:165: in new_forward
output = old_forward(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:642: in forward
self._rescale_layers()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = RwkvModel(
(embeddings): Embedding(50277, 768)
(blocks): ModuleList(
(0): RwkvBlock(
(pre_ln): LayerNorm...72, out_features=768, bias=False)
)
)
)
(ln_out): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
def _rescale_layers(self):
# Layers should be rescaled for inference only.
if self.layers_are_rescaled == (not self.training):
return
if self.config.rescale_every > 0:
with torch.no_grad():
for block_id, block in enumerate(self.blocks):
if self.training:
block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
else:
# Deal with quantization statistics
if hasattr(block.attention.output.weight, "SCB"):
block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
elif hasattr(block.attention.output.weight, "quant_state"):
> block.attention.output.weight.quant_state[0].div_(
2 ** int(block_id // self.config.rescale_every)
)
E TypeError: 'QuantState' object is not subscriptable
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:714: TypeError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
Downloading config.json: 100%|██████████| 521/521 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 677M/677M [00:34<00:00, 19.9MB/s]
Downloading generation_config.json: 100%|██████████| 116/116 [00:00<?, ?B/s]
Downloading tokenizer_config.json: 100%|██████████| 264/264 [00:00<00:00, 264kB/s]
Downloading tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 2.84MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 99.0/99.0 [00:00<?, ?B/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
_____________________________________________________________________________________________ Bnb4BitGPT2Test.test_generate_quality _____________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitGPT2Test testMethod=test_generate_quality>
def test_generate_quality(self):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = self.model_4bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
> self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
E AssertionError: 'Hello my name is John Doe. I am a man. I am' not found in {'Hello my name is John.\nI am a friend of your father.\n', 'Hello my name is John Doe, I am a student at the University', 'Hello my name is John and I am a professional photographer. I'}
tests\quantization\bnb\test_4bit.py:222: AssertionError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
_________________________________________________________________________________________ Bnb4BitGPT2Test.test_generate_quality_config __________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitGPT2Test testMethod=test_generate_quality_config>
def test_generate_quality_config(self):
r"""
Test that loading the model with the config is equivalent
"""
bnb_config = BitsAndBytesConfig()
bnb_config.load_in_4bit = True
model_4bit_from_config = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=bnb_config, device_map="auto"
)
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_4bit_from_config.generate(
input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10
)
> self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
E AssertionError: 'Hello my name is John Doe. I am a man. I am' not found in {'Hello my name is John.\nI am a friend of your father.\n', 'Hello my name is John Doe, I am a student at the University', 'Hello my name is John and I am a professional photographer. I'}
tests\quantization\bnb\test_4bit.py:240: AssertionError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
_____________________________________________________________________________________________ Bnb4BitGPT2Test.test_memory_footprint _____________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitGPT2Test testMethod=test_memory_footprint>
def test_memory_footprint(self):
r"""
A simple test to check if the model conversion has been done correctly by checking on the
memory footprint of the converted model and the class type of the linear layers of the converted models
"""
from bitsandbytes.nn import Params4bit
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_4bit = self.model_4bit.get_memory_footprint()
> self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE)
E AssertionError: 1.0 != 3.3191854854152187 within 7 places (2.3191854854152187 difference)
tests\quantization\bnb\test_4bit.py:169: AssertionError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
______________________________________________________________________________________________ Bnb4BitGPT2Test.test_original_dtype ______________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitGPT2Test testMethod=test_original_dtype>
def test_original_dtype(self):
r"""
A simple test to check if the model succesfully stores the original dtype
"""
> self.assertTrue(hasattr(self.model_4bit.config, "_pre_quantization_dtype"))
E AssertionError: False is not true
tests\quantization\bnb\test_4bit.py:177: AssertionError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
________________________________________________________________________________________________ Bnb4BitGPT2Test.test_rwkv_4bit _________________________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitGPT2Test testMethod=test_rwkv_4bit>
def test_rwkv_4bit(self):
r"""
A simple test to check if 4-bit RWKV inference works as expected.
"""
model_id = "RWKV/rwkv-4-169m-pile"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
tok = AutoTokenizer.from_pretrained(model_id)
text = "Hello my name is"
input_ids = tok.encode(text, return_tensors="pt").to(0)
> _ = model.generate(input_ids, max_new_tokens=30)
tests\quantization\bnb\test_4bit.py:211:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\_contextlib.py:115: in decorate_context
return func(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:1522: in generate
return self.greedy_search(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:2339: in greedy_search
outputs = self(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:165: in new_forward
output = old_forward(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:789: in forward
rwkv_outputs = self.rwkv(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1518: in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\nn\modules\module.py:1527: in _call_impl
return forward_call(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:165: in new_forward
output = old_forward(*args, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:642: in forward
self._rescale_layers()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = RwkvModel(
(embeddings): Embedding(50277, 768)
(blocks): ModuleList(
(0): RwkvBlock(
(pre_ln): LayerNorm...72, out_features=768, bias=False)
)
)
)
(ln_out): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
def _rescale_layers(self):
# Layers should be rescaled for inference only.
if self.layers_are_rescaled == (not self.training):
return
if self.config.rescale_every > 0:
with torch.no_grad():
for block_id, block in enumerate(self.blocks):
if self.training:
block.attention.output.weight.mul_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
else:
# Deal with quantization statistics
if hasattr(block.attention.output.weight, "SCB"):
block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
elif hasattr(block.attention.output.weight, "quant_state"):
> block.attention.output.weight.quant_state[0].div_(
2 ** int(block_id // self.config.rescale_every)
)
E TypeError: 'QuantState' object is not subscriptable
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\rwkv\modeling_rwkv.py:714: TypeError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
___________________________________________________________________________________________ BaseSerializationTest.test_serialization ____________________________________________________________________________________________
self = <bnb.test_4bit.BaseSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
Downloading tokenizer_config.json: 100%|██████████| 685/685 [00:00<00:00, 680kB/s]
Downloading config.json: 100%|██████████| 651/651 [00:00<?, ?B/s]
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.60MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 821kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 441/441 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|██████████| 251M/251M [00:04<00:00, 60.3MB/s]
Downloading generation_config.json: 100%|██████████| 137/137 [00:00<?, ?B/s]
________________________________________________________________________________________ ExtendedSerializationTest.test_fp4_double_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_double_safe>
def test_fp4_double_safe(self):
> self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:634:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_fp4_double_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_double_unsafe>
def test_fp4_double_unsafe(self):
> self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:631:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
________________________________________________________________________________________ ExtendedSerializationTest.test_fp4_single_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_single_safe>
def test_fp4_single_safe(self):
> self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:628:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_fp4_single_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_single_unsafe>
def test_fp4_single_unsafe(self):
> self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:625:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_nf4_double_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_double_unsafe>
def test_nf4_double_unsafe(self):
> self.test_serialization(quant_type="nf4", double_quant=True, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:620:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
________________________________________________________________________________________ ExtendedSerializationTest.test_nf4_single_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_single_safe>
def test_nf4_single_safe(self):
> self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:617:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_nf4_single_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_single_unsafe>
def test_nf4_single_unsafe(self):
> self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:614:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_________________________________________________________________________________________ ExtendedSerializationTest.test_serialization __________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
___________________________________________________________________________________________ BloomSerializationTest.test_serialization ___________________________________________________________________________________________
self = <bnb.test_4bit.BloomSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\models\auto\auto_factory.py:484: in from_pretrained
return model_class.from_pretrained(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:2937: in from_pretrained
dispatch_model(model, **kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 43],
[ 50],
[197],
...,
[148],
[117],
[ 35]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
Downloading tokenizer_config.json: 100%|██████████| 222/222 [00:00<?, ?B/s]
Downloading tokenizer.json: 100%|██████████| 14.5M/14.5M [00:00<00:00, 59.6MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 85.0/85.0 [00:00<?, ?B/s]
____________________________________________________________________________________________ GPTSerializationTest.test_serialization ____________________________________________________________________________________________
self = <bnb.test_4bit.GPTSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
with tempfile.TemporaryDirectory() as tmpdirname:
> model_0.save_pretrained(tmpdirname, safe_serialization=safe_serialization)
tests\quantization\bnb\test_4bit.py:550:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = GPT2LMHeadModel(
(transformer): GPT2Model(
(wte): Embedding(50257, 1600)
(wpe): Embedding(1024, 1600)
(d...1600,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)
save_directory = 'C:\\Users\\WK\\AppData\\Local\\Temp\\tmpam_q0sfp', is_main_process = True, state_dict = None, save_function = <function save at 0x00000000134056C0>, push_to_hub = False, max_shard_size = '10GB'
safe_serialization = True, variant = None, kwargs = {}
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
is_main_process: bool = True,
state_dict: Optional[dict] = None,
save_function: Callable = torch.save,
push_to_hub: bool = False,
max_shard_size: Union[int, str] = "10GB",
safe_serialization: bool = False,
variant: Optional[str] = None,
**kwargs,
):
"""
Save a model and its configuration file to a directory, so that it can be re-loaded using the
[`~PreTrainedModel.from_pretrained`] class method.
Arguments:
save_directory (`str` or `os.PathLike`):
Directory to which to save. Will be created if it doesn't exist.
is_main_process (`bool`, *optional*, defaults to `True`):
Whether the process calling this is the main process or not. Useful when in distributed training like
TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
the main process to avoid race conditions.
state_dict (nested dictionary of `torch.Tensor`):
The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
save parts of the model or if special precautions need to be taken when recovering the state dictionary
of a model (like when using model parallelism).
save_function (`Callable`):
The function to use to save the state dictionary. Useful on distributed training like TPUs when one
need to replace `torch.save` by another method.
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
<Tip warning={true}>
If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
which will be bigger than `max_shard_size`.
</Tip>
safe_serialization (`bool`, *optional*, defaults to `False`):
Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
variant (`str`, *optional*):
If specified, weights are saved in the format pytorch_model.<variant>.bin.
kwargs:
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
"""
# Checks if the model has been loaded in 8-bit
if getattr(self, "is_loaded_in_8bit", False) and getattr(self, "is_8bit_serializable", False):
warnings.warn(
"You are calling `save_pretrained` to a 8-bit converted model you may likely encounter unexepected"
" behaviors. If you want to save 8-bit models, make sure to have `bitsandbytes>0.37.2` installed.",
UserWarning,
)
if getattr(self, "is_loaded_in_4bit", False):
> raise NotImplementedError(
"You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported"
)
E NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\modeling_utils.py:1716: NotImplementedError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
You are loading your model in 8bit or 4bit but no linear modules were found in your model. this can happen for some architectures such as gpt2 that uses Conv1D instead of Linear layers. Please double check your model architecture, or submit an issue on github if you think this is a bug.
___________________________________________________________________________________ Bnb4BitTestBasicConfigTest.test_load_in_4_and_8_bit_fails ___________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitTestBasicConfigTest testMethod=test_load_in_4_and_8_bit_fails>
def test_load_in_4_and_8_bit_fails(self):
> with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
E AssertionError: ValueError not raised
tests\quantization\bnb\test_4bit.py:659: AssertionError
_______________________________________________________________________________________ Bnb4BitTestBasicConfigTest.test_set_load_in_8_bit _______________________________________________________________________________________
self = <bnb.test_4bit.Bnb4BitTestBasicConfigTest testMethod=test_set_load_in_8_bit>
def test_set_load_in_8_bit(self):
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
> with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
E AssertionError: ValueError not raised
tests\quantization\bnb\test_4bit.py:664: AssertionError
======================================================================================================= warnings summary ========================================================================================================
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373: PytestConfigWarning: Unknown config option: doctest_glob
self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_generate_quality
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_generate_quality_config
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Pipeline4BitTest::test_pipeline
tests/quantization/bnb/test_4bit.py::Bnb4BitTestTraining::test_training
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:248: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:28: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
from pkg_resources import packaging # type: ignore[attr-defined]
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\pkg_resources\__init__.py:2868: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:383: UserWarning: Error checking compiler version for clang++: Command 'clang++' returned non-zero exit status 1.
warnings.warn(f'Error checking compiler version for {compiler}: {error}')
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:1259: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)
warnings.warn(
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\transformers\generation\utils.py:1353: UserWarning: Using `max_length`'s default (20) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:245: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.')
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
==================================================================================================== short test summary info ====================================================================================================
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_original_dtype - AssertionError: False is not true
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_quantization_num_parameters - AssertionError: 1118429184 != 1722408960
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit - TypeError: 'QuantState' object is not subscriptable
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_generate_quality - AssertionError: 'Hello my name is John Doe. I am a man. I am' not found in {'Hello my name is John.\nI am a friend of your father.\n', 'Hello my name is John Doe, I am a student at the University', 'Hello my name is John ...
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_generate_quality_config - AssertionError: 'Hello my name is John Doe. I am a man. I am' not found in {'Hello my name is John.\nI am a friend of your father.\n', 'Hello my name is John Doe, I am a student at the University', 'Hello my name is John ...
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_memory_footprint - AssertionError: 1.0 != 3.3191854854152187 within 7 places (2.3191854854152187 difference)
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_original_dtype - AssertionError: False is not true
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitGPT2Test::test_rwkv_4bit - TypeError: 'QuantState' object is not subscriptable
FAILED tests/quantization/bnb/test_4bit.py::BaseSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_double_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_double_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_single_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_single_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_double_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_single_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_single_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::BloomSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::GPTSerializationTest::test_serialization - NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitTestBasicConfigTest::test_load_in_4_and_8_bit_fails - AssertionError: ValueError not raised
FAILED tests/quantization/bnb/test_4bit.py::Bnb4BitTestBasicConfigTest::test_set_load_in_8_bit - AssertionError: ValueError not raised
=============================================================================== 21 failed, 17 passed, 1 skipped, 19 warnings in 972.20s (0:16:12) ===============================================================================
(venv) D:\src\transformers>
Thanks a lot for running the tests!
Hmmm, I think you might not have installed transformers from source. Can you try building transformers from source (pip install -e ".[dev]") and re-running the tests? 🙏
(venv) D:\src\transformers>pip show transformers
WARNING: Ignoring invalid distribution -afetensors (f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages)
WARNING: Ignoring invalid distribution -itsandbytes (f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages)
WARNING: Ignoring invalid distribution -orch (f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages)
WARNING: Ignoring invalid distribution -rotobuf (f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages)
Name: transformers
Version: 4.38.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: [email protected]
License: Apache 2.0 License
Location: f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages
Editable project location: D:\src\transformers
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: groundingdino, image-reward, lycoris-lora
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_4bit.py
====================================================================================================== test session starts ======================================================================================================
platform win32 -- Python 3.10.11, pytest-7.4.2, pluggy-1.3.0
rootdir: D:\src\transformers
configfile: pyproject.toml
plugins: anyio-3.7.1, hydra-core-1.3.2, hypothesis-6.93.0, xdist-3.5.0
collected 39 items
tests\quantization\bnb\test_4bit.py ..............s...........FFFFFFFFFFF.. [100%]
Details
=========================================================================================================== FAILURES ============================================================================================================
___________________________________________________________________________________________ BaseSerializationTest.test_serialization ____________________________________________________________________________________________
self = <bnb.test_4bit.BaseSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
________________________________________________________________________________________ ExtendedSerializationTest.test_fp4_double_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_double_safe>
def test_fp4_double_safe(self):
> self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:634:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_fp4_double_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_double_unsafe>
def test_fp4_double_unsafe(self):
> self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:631:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
________________________________________________________________________________________ ExtendedSerializationTest.test_fp4_single_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_single_safe>
def test_fp4_single_safe(self):
> self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:628:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_fp4_single_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_fp4_single_unsafe>
def test_fp4_single_unsafe(self):
> self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:625:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 65],
[ 26],
[ 70],
...,
[103],
[199],
[167]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_nf4_double_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_double_unsafe>
def test_nf4_double_unsafe(self):
> self.test_serialization(quant_type="nf4", double_quant=True, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:620:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
________________________________________________________________________________________ ExtendedSerializationTest.test_nf4_single_safe _________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_single_safe>
def test_nf4_single_safe(self):
> self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=True)
tests\quantization\bnb\test_4bit.py:617:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_______________________________________________________________________________________ ExtendedSerializationTest.test_nf4_single_unsafe ________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_nf4_single_unsafe>
def test_nf4_single_unsafe(self):
> self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=False)
tests\quantization\bnb\test_4bit.py:614:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests\quantization\bnb\test_4bit.py:543: in test_serialization
model_0 = AutoModelForCausalLM.from_pretrained(
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
_________________________________________________________________________________________ ExtendedSerializationTest.test_serialization __________________________________________________________________________________________
self = <bnb.test_4bit.ExtendedSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[199],
[113],
[185],
...,
[138],
[ 74],
[ 26]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
___________________________________________________________________________________________ BloomSerializationTest.test_serialization ___________________________________________________________________________________________
self = <bnb.test_4bit.BloomSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 43],
[ 50],
[197],
...,
[148],
[117],
[ 35]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
____________________________________________________________________________________________ GPTSerializationTest.test_serialization ____________________________________________________________________________________________
self = <bnb.test_4bit.GPTSerializationTest testMethod=test_serialization>, quant_type = 'nf4', double_quant = True, safe_serialization = True
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
> model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
tests\quantization\bnb\test_4bit.py:543:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
src\transformers\models\auto\auto_factory.py:567: in from_pretrained
return model_class.from_pretrained(
src\transformers\modeling_utils.py:3560: in from_pretrained
dispatch_model(model, **device_map_kwargs)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\big_modeling.py:371: in dispatch_model
attach_align_device_hook_on_blocks(
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:506: in attach_align_device_hook_on_blocks
add_hook_to_module(module, hook)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:155: in add_hook_to_module
module = hook.init_hook(module)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\hooks.py:253: in init_hook
set_module_tensor_to_device(module, name, self.execution_device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\accelerate\utils\modeling.py:320: in set_module_tensor_to_device
new_value = param_cls(new_value, requires_grad=old_value.requires_grad).to(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:211: in to
return self._quantize(device)
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:183: in _quantize
w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
A = tensor([[ 87],
[234],
[ 84],
...,
[234],
[135],
[175]], device='cuda:0', dtype=torch.uint8), absmax = tensor([0., 0., 0., ..., 0., 0., 0.], device='cuda:0')
out = tensor([[0],
[0],
[0],
...,
[0],
[0],
[0]], device='cuda:0', dtype=torch.uint8), blocksize = 64, compress_statistics = True, quant_type = 'fp4', quant_storage = torch.uint8
def quantize_4bit(
A: Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
compress_statistics=False,
quant_type='fp4',
quant_storage=torch.uint8,
) -> Tuple[Tensor, QuantState]:
"""
Quantize tensor A in blocks of 4-bit values.
Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
Parameters
----------
A : torch.Tensor
The input tensor.
absmax : torch.Tensor
The absmax values.
out : torch.Tensor
The output tensor.
blocksize : int
The blocksize used in quantization.
quant_type : str
The 4-bit quantization data type {fp4, nf4}
Returns
-------
torch.Tensor:
Tensor with packed 4-bit values.
tuple(torch.Tensor, torch.Size, torch.dtype, int):
The quantization state to undo the quantization.
"""
if A.device.type != 'cuda':
raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}')
if quant_type not in ['fp4', 'nf4']:
raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.')
n = A.numel()
input_shape = A.shape
if absmax is None:
blocks = n // blocksize
blocks += 1 if n % blocksize > 0 else 0
absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
if out is None:
mod = dtype2bytes[quant_storage] * 2
out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device)
assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
prev_device = pre_call(A.device)
is_on_gpu([A, out, absmax])
if A.dtype == torch.float32:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.float16:
if quant_type == 'fp4':
lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
elif A.dtype == torch.bfloat16:
if quant_type == 'fp4':
lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n))
else:
> raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
E ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\functional.py:994: ValueError
======================================================================================================= warnings summary ========================================================================================================
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373: PytestConfigWarning: Unknown config option: doctest_glob
self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
tests/quantization/bnb/test_4bit.py: 10 warnings
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:248: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
return self.fget.__get__(instance, owner)()
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:28: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
from pkg_resources import packaging # type: ignore[attr-defined]
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\pkg_resources\__init__.py:2868: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:383: UserWarning: Error checking compiler version for clang++: Command 'clang++' returned non-zero exit status 1.
warnings.warn(f'Error checking compiler version for {compiler}: {error}')
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
D:\src\transformers\src\transformers\generation\utils.py:1133: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:245: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.')
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
==================================================================================================== short test summary info ====================================================================================================
FAILED tests/quantization/bnb/test_4bit.py::BaseSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_double_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_double_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_single_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_fp4_single_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_double_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_single_safe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_nf4_single_unsafe - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::BloomSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
FAILED tests/quantization/bnb/test_4bit.py::GPTSerializationTest::test_serialization - ValueError: Blockwise quantization only supports 16/32-bit floats, but got torch.uint8
=============================================================================== 11 failed, 27 passed, 1 skipped, 22 warnings in 423.85s (0:07:03) ===============================================================================
Interesting! The great news is that only the serialization tests are failing. Can you try updating accelerate? Run: pip install -U accelerate (this might fix the failing tests).
In addition to that, can you run the 8-bit tests? 🙏 Run: RUN_SLOW=1 pytest tests/quantization/bnb/test_mixed_int8.py
After updating accelerate, the test_serialization tests passed successfully!
(venv) >pip show accelerate
Name: accelerate
Version: 0.26.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: [email protected]
License: Apache
Location: f:\webui\webui\stable-diffusion-webui\venv\lib\site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: image-reward
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_4bit.py -k "test_serialization"
====================================================================================================== test session starts ======================================================================================================
platform win32 -- Python 3.10.11, pytest-7.4.2, pluggy-1.3.0
rootdir: D:\src\transformers
configfile: pyproject.toml
plugins: anyio-3.7.1, hydra-core-1.3.2, hypothesis-6.93.0, xdist-3.5.0
collected 39 items / 35 deselected / 4 selected
tests\quantization\bnb\test_4bit.py .... [100%]
======================================================================================================= warnings summary ========================================================================================================
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373: PytestConfigWarning: Unknown config option: doctest_glob
self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
tests/quantization/bnb/test_4bit.py::BaseSerializationTest::test_serialization
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
return self.fget.__get__(instance, owner)()
tests/quantization/bnb/test_4bit.py::BaseSerializationTest::test_serialization
tests/quantization/bnb/test_4bit.py::ExtendedSerializationTest::test_serialization
tests/quantization/bnb/test_4bit.py::BloomSerializationTest::test_serialization
tests/quantization/bnb/test_4bit.py::GPTSerializationTest::test_serialization
D:\src\transformers\src\transformers\quantizers\auto.py:147: UserWarning: You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` from the model will be prevail.
warnings.warn(warning_msg)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========================================================================================= 4 passed, 35 deselected, 6 warnings in 46.51s =========================================================================================
Running the full test_4bit.py suite again:
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_4bit.py
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_4bit.py
====================================================================================================== test session starts ======================================================================================================
platform win32 -- Python 3.10.11, pytest-7.4.2, pluggy-1.3.0
rootdir: D:\src\transformers
configfile: pyproject.toml
plugins: anyio-3.7.1, hydra-core-1.3.2, hypothesis-6.93.0, xdist-3.5.0
collected 39 items
tests\quantization\bnb\test_4bit.py ..............s........................ [100%]
Details
======================================================================================================= warnings summary ========================================================================================================
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\_pytest\config\__init__.py:1373: PytestConfigWarning: Unknown config option: doctest_glob
self._warn_or_fail_if_strict(f"Unknown config option: {key}\n")
tests/quantization/bnb/test_4bit.py: 10 warnings
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:248: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
return self.fget.__get__(instance, owner)()
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:28: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
from pkg_resources import packaging # type: ignore[attr-defined]
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\pkg_resources\__init__.py:2868: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('google')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
declare_namespace(pkg)
tests/quantization/bnb/test_4bit.py::Bnb4BitTest::test_rwkv_4bit
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\torch\utils\cpp_extension.py:383: UserWarning: Error checking compiler version for clang++: Command 'clang++' returned non-zero exit status 1.
warnings.warn(f'Error checking compiler version for {compiler}: {error}')
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
D:\src\transformers\src\transformers\generation\utils.py:1133: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_with_keep_in_fp32
tests/quantization/bnb/test_4bit.py::Bnb4BitT5Test::test_inference_without_keep_in_fp32
F:\webui\webui\stable-diffusion-webui\venv\lib\site-packages\bitsandbytes\nn\modules.py:245: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.
warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.')
tests/quantization/bnb/test_4bit.py: 11 warnings
D:\src\transformers\src\transformers\quantizers\auto.py:147: UserWarning: You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` from the model will be prevail.
warnings.warn(warning_msg)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
==================================================================================== 38 passed, 1 skipped, 33 warnings in 441.37s (0:07:21) =====================================================================================
All tests passed for 4-bit!! 😎
Short test summary info for the mixed_int8 tests:
(venv) D:\src\transformers>python -m pytest tests\quantization\bnb\test_mixed_int8.py
====================================================================================================== test session starts ======================================================================================================
platform win32 -- Python 3.10.11, pytest-7.4.2, pluggy-1.3.0
rootdir: D:\src\transformers
configfile: pyproject.toml
plugins: anyio-3.7.1, hydra-core-1.3.2, hypothesis-6.93.0, xdist-3.5.0
collected 43 items
tests\quantization\bnb\test_mixed_int8.py .....................sssss...FF..FFFF......
(snip)...
=========================================================================================================== FAILURES ============================================================================================================
____________________________________________________________________________________________ MixedInt8GPT2Test.test_generate_quality ____________________________________________________________________________________________
self = <bnb.test_mixed_int8.MixedInt8GPT2Test testMethod=test_generate_quality>
def test_generate_quality(self):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = self.model_8bit.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
> self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
E AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
tests\quantization\bnb\test_mixed_int8.py:264: AssertionError
----------------------------------------------------------------------------------------------------- Captured stderr call ------------------------------------------------------------------------------------------------------
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
(snip)...
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
==================================================================================================== short test summary info ====================================================================================================
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_generate_quality - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_generate_quality_config - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_int8_from_pretrained - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_int8_serialization - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_int8_serialization_regression - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
FAILED tests/quantization/bnb/test_mixed_int8.py::MixedInt8GPT2Test::test_int8_serialization_sharded - AssertionError: 'Hello my name is John Doe, and I am a member of the' not found in {"Hello my name is John Doe, and I'm a big fan of", "Hello my name is John Doe, and I'm a fan of the"}
=============================================================================== 6 failed, 32 passed, 5 skipped, 19 warnings in 720.00s (0:11:59) ================================================================================
AMAZING @wkpark! 🎉 For the 8bit tests, the quality tests are expected not to pass, so don't worry about them.
Can you in addition to that run the 8bit tests? 🙏
RUN_SLOW=1 pytest tests/quantization/bnb/test_mixed_int8.py
For the record, RUN_SLOW is not a thing for this repository – I added Pytest marks for that. Slow tests are run by default, but you can opt-out via -k "not slow".
@akx thanks! I meant the slow tests in the transformers repository, not the ones in the bnb repository (I think you were referring to the slow tests for bnb, no?)
I was able to build with CUDA 12.0 and run the tests on Windows.
Hardware: CPU: i7-12700H GPU: RTX 3060 Mobile
Software: OS: Windows 11 MSVC: 19.38.33134 (VC++ Toolset 14.38.33130) CMake: 3.27.2-msvc1 CUDA Toolkit: 12.0.140 NVIDIA Driver: 546.65 Python: 3.11.6 PyTorch: 2.2.0+cu121 Transformers: 4.37.2
Build configuration:
CMAKE_BUILD_TYPE=Release
BUILD_CUDA=ON
NO_CUBLASLT=OFF
CUDA_VERSION=120
COMPUTE_CAPABILITY=50;52;53;60;61;62;70;72;75;80;86;87;89;90
PTXAS_VERBOSE=OFF
I've observed the same crash on the tests in test_optim. When I skip those tests and the benchmark/slow ones, here is my result:
2901 passed
24 failed
- tests/test_functional.py:533 test_vector_quant[dim3=56-dim2=80-dim1=12]
- tests/test_functional.py:2155 test_gemv_4bit[uint8-bf16-fc2-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[uint8-bf16-fc2-nf4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[uint8-bf16-fc2-fp4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[uint8-bf16-fc2-fp4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[fp16-fp16-fc2-fp4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[fp16-bf16-fc2-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp16-bf16-fc2-nf4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[fp16-bf16-fc2-fp4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp16-bf16-fc2-fp4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-fp16-fc2-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-fp16-fc2-fp4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-fp16-attn-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-bf16-fc2-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-bf16-fc2-nf4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-bf16-fc2-fp4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-bf16-fc2-fp4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[bf16-bf16-attn-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-fp16-fc2-nf4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-fp16-fc2-fp4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-bf16-fc2-nf4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-bf16-fc2-nf4-DQ_False]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-bf16-fc2-fp4-DQ_True]
- tests/test_functional.py:2155 test_gemv_4bit[fp32-bf16-fc2-fp4-DQ_False]
9 skipped
25 deselected
As for the optimizer tests, these complete with 3 failures and 2 skips prior to a crash:
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=momentum] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=rmsprop] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=paged_adamw] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=paged_adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp32-opt=paged_lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=momentum] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=rmsprop] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=paged_adamw] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=paged_adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-fp16-opt=paged_lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=momentum] s
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=rmsprop] s
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=paged_adamw] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=paged_adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=32-dim1=1024-bf16-opt=paged_lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=momentum] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=rmsprop] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=paged_adamw] ⨯
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=paged_adam] ⨯
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=lion] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp32-opt=paged_lion] ⨯
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp16-opt=adam] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp16-opt=momentum] ✓
tests\test_optim.py::test_optimizer32bit[dim2=1024-dim1=1024-fp16-opt=rmsprop] ✓
Are those tests only failing due to slight deviations from the tolerances? If this is the case, then this is expected due to the unfortunately quite flaky tests (something we'll work on fixing soon).
In that case, we could close this issue and be super happy that this whole Windows journey went so well! Thanks again to everyone involved, especially @wkpark and @matthewdouglas ❤️
@Titus-von-Koeller Yes, the failures were related to some tolerances and the stochastic nature of some of the tests. I get similar results on my Linux machine.
I do think the crash on the 32bit optimizer tests was related to the 6GB of VRAM that I have on my Windows machine. It seems those tests need closer to ~12GB to run. Stabilizing these tests is a good separate, cross-platform issue, but I think we're good to close this one.