[BUG] Zeros in returned k_v cache when get_present = True
Describe the bug
Running a forward pass with get_present=True returns wrong values for the key and value tensors: they contain a lot of zeros.
To Reproduce
Steps to reproduce the behavior:
- Simple inference script to reproduce:
import os
import time
import deepspeed
import torch
import torch.nn as nn
from deepspeed.module_inject.replace_policy import TransformerPolicy, DSPolicy
from torch.nn import Parameter, LayerNorm


class TransformerInferenceCustomPolicy(TransformerPolicy):
    _orig_layer_class = None

    def __init__(self, client_module, inference):
        super().__init__(inference)
        self.client_module = client_module
        if TransformerInferenceCustomPolicy._orig_layer_class is None:
            TransformerInferenceCustomPolicy._orig_layer_class = DeepSpeedTransformerInferenceCustomLayer

    def get_hidden_heads(self):
        return self.client_module.attention.query_key_value.weight.shape[1], \
            self.client_module.attention.heads

    def attention(self):
        qkvw = self.client_module.attention.query_key_value.weight
        qkvb = self.client_module.attention.query_key_value.bias
        return self.linear_layer, \
            qkvw, \
            qkvb, \
            self.client_module.attention.dense.weight, \
            self.client_module.attention.dense.bias, \
            self.scale_attention, \
            self.is_megatron_v2

    def mlp(self):
        return self.linear_layer, \
            self.client_module.mlp.dense_to_4h.weight, \
            self.client_module.mlp.dense_to_4h.bias, \
            self.client_module.mlp.dense_to_h.weight, \
            self.client_module.mlp.dense_to_h.bias

    def layerNorm(self):
        attention_layernorm = self.client_module.post_attention_layernorm
        transformer_layernorm = self.client_module.input_layernorm
        return attention_layernorm.weight, \
            attention_layernorm.bias, \
            transformer_layernorm.weight, \
            transformer_layernorm.bias


class WeightBiasContainer(torch.nn.Module):
    # Wrapper for a param containing weight and bias
    def __init__(self, weight, bias):
        super().__init__()
        self.weight = weight
        self.bias = bias


class DeepSpeedMLPCustom(nn.Module):
    def __init__(self, hidden_size, intermediate_multiplier=4, layernorm_epsilon=1e-5):
        super().__init__()
        intermediate_size = hidden_size * intermediate_multiplier
        inter_w = nn.Parameter(torch.Tensor(intermediate_size, hidden_size))
        inter_b = nn.Parameter(torch.Tensor(intermediate_size))
        output_w = nn.Parameter(torch.Tensor(hidden_size, intermediate_size))
        output_b = nn.Parameter(torch.Tensor(hidden_size))
        self.dense_to_4h = WeightBiasContainer(inter_w, inter_b)
        self.dense_to_h = WeightBiasContainer(output_w, output_b)


class DeepSpeedSelfAttentionCustom(nn.Module):
    def __init__(self, hidden_size, heads, qkv_merging=False):
        super(DeepSpeedSelfAttentionCustom, self).__init__()
        self.heads = heads
        attn_qkvw = nn.Parameter(torch.Tensor(hidden_size * 3, hidden_size))
        attn_qkvb = nn.Parameter(torch.Tensor(hidden_size * 3))
        self.query_key_value = WeightBiasContainer(attn_qkvw, attn_qkvb)
        attn_ow = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        attn_ob = nn.Parameter(torch.Tensor(hidden_size))
        self.dense = WeightBiasContainer(attn_ow, attn_ob)


class DeepSpeedTransformerInferenceCustomLayer(nn.Module):
    def __init__(self, hidden_size, heads, layernorm_epsilon=1e-5):
        super().__init__()
        self.hidden_size = hidden_size
        self.heads = heads
        self.input_layernorm = WeightBiasContainer(
            nn.Parameter(torch.Tensor(hidden_size)),
            nn.Parameter(torch.Tensor(hidden_size)))
        self.post_attention_layernorm = WeightBiasContainer(
            nn.Parameter(torch.Tensor(hidden_size)),
            nn.Parameter(torch.Tensor(hidden_size)))
        self.attention = DeepSpeedSelfAttentionCustom(hidden_size, heads)
        self.mlp = DeepSpeedMLPCustom(hidden_size, layernorm_epsilon=layernorm_epsilon)


class TransformerInferenceCustomLayer(nn.Module):
    def __init__(self, hidden_size, heads, post_layer_norm=False, layernorm_epsilon=1e-5):
        super().__init__()
        self.transformer = DeepSpeedTransformerInferenceCustomLayer(hidden_size, heads, layernorm_epsilon)
        if post_layer_norm:
            self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        else:
            self.final_layernorm = None

    def forward(self, *inputs, **kwargs):
        outputs, (k, v) = self.transformer(*inputs, **kwargs)
        return outputs, (k, v)
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([TransformerInferenceCustomLayer(128, 4)] * 3)

    def forward(self):
        device = list(self.layers[0].parameters())[0].device
        d = 10
        (k, v) = None, None
        output = None
        tokens = torch.rand((1, 100, 128), dtype=torch.float16, device=device)
        for i in range(d):
            t = time.time()
            tokens = torch.rand((1, 100, 128), dtype=torch.float16, device=device)
            for l in range(len(self.layers)):
                if i == 0:
                    # First iteration: full sequence as input (prompt encoding)
                    output, (k, v) = self.layers[l](tokens, get_present=True,
                                                    layer_past=(k, v) if k is not None else None)
                else:
                    # Later iterations: a single token, reusing the returned k/v cache
                    output, (k, v) = self.layers[l](torch.rand((1, 1, 128), dtype=torch.float16, device=device),
                                                    get_present=True,
                                                    layer_past=(k, v) if k is not None else None)
            print(time.time() - t)
        print(k)
        print(output)


model = Model()
model = deepspeed.init_inference(model,
                                 mp_size=1,
                                 replace_with_kernel_inject=True,
                                 injection_policy={DeepSpeedTransformerInferenceCustomLayer: (TransformerInferenceCustomPolicy,)},
                                 dtype=torch.float16)

# Fill the kernel-injected module's weights with random values and its biases with zeros
sd = model.state_dict()
for k, v in sd.items():
    if k[-1] != 'b':
        sd[k] = torch.randn_like(v)
    else:
        sd[k] = torch.zeros_like(v)
model.load_state_dict(sd)
model()
- Required packages and their versions: deepspeed==0.8.0, torch==1.13.1
- How to run the script: run it as is; it can also be run in a notebook.
- Explanation: the script declares a policy and a basic structure to hold the model parameters before kernel replacement. It then loops over the layers: the first cycle of the loop feeds the full seq_len as input (prompt encoding), and subsequent cycles feed seq_len = 1 (classic k/v-cache behavior). The script prints the last key from layer_past, which shows that roughly 90% of the tensor is 0 (see the sketch below for one way to quantify this).
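For reference, a minimal sketch (not part of the original script) of one way to quantify that observation; the helper name zero_fraction is an assumption used only for illustration:

import torch

def zero_fraction(t: torch.Tensor) -> float:
    # Fraction of exactly-zero entries in a tensor.
    return (t == 0).float().mean().item()

# Example: in Model.forward() above, replace print(k) with
#     print(f"zeros in last key cache: {zero_fraction(k):.2%}")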
Expected behavior
The K/V cache should return a reasonable tensor without zeros.
ds_report output
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
runtime if needed. Op compatibility means that your system
meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
[WARNING] please install triton==1.0.0 if you want to use sparse attention
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/morzusman/.pyenv/versions/3.8.5/envs/jurassic-serving/lib/python3.8/site-packages/torch']
torch version .................... 1.13.1+cu117
deepspeed install path ........... ['/home/morzusman/.pyenv/versions/3.8.5/envs/jurassic-serving/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.8.0, unknown, unknown
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 11.3
deepspeed wheel compiled w. ...... torch 1.13, cuda 11.7
Screenshots
Screenshot of the returned K tensor
System info (please complete the following information):
- OS: Debian 10
- GPU count and types: 1x A100
- (if applicable) what DeepSpeed-MII version are you using
- (if applicable) Hugging Face Transformers/Accelerate/etc. versions
- Python version: 3.8.5
- Any other relevant info about your setup
Additional context
Please note that I'm using DeepSpeed version 0.8.0, which still has this issue: https://github.com/microsoft/DeepSpeed/issues/2602.
I solved it temporarily by replacing the line policy = policy_cls(child, inference=inference)
with policy = policy_cls[0](child, inference=inference)
at line 335 (policy_module.py).
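For reference, a minimal sketch of that workaround; the file and line number are as reported for DeepSpeed 0.8.0 and may differ in other releases:

# Workaround for the tuple-valued injection_policy problem (issue #2602),
# applied by hand at the location reported above in DeepSpeed 0.8.0.
#
# Original line (fails when the policy is registered as a one-element tuple):
#     policy = policy_cls(child, inference=inference)
#
# Patched line (unpack the tuple before instantiating the policy):
#     policy = policy_cls[0](child, inference=inference)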
I opened a PR fixing this issue: https://github.com/microsoft/DeepSpeed/pull/2828
Sorry for the delays, your PR is in our merge queue now.