FasterTransformer
In fp16 mode, slicing the result of ViT gives NaN
Hi, I want to use my ViT in FasterTransformer. At the end of the network I want to output only some of the features, not all of them, so I implemented a CUDA function. In fp32 the result and its dimensions match torch, but in fp16 the result is NaN. Could you help me find what is causing it?
// my implementation (pseudocode)
vit_output.size(batch, 193, 768)
my_output = vit_output[:, 0]
my_output.size(batch, 768)
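In flat row-major terms this means out[b * 768 + c] = in[b * 193 * 768 + c]. A minimal host-side reference one could compare the kernel output against (a sketch with illustrative names, not code from the repo):

#include <cstddef>

// Hypothetical CPU reference: copy token 0 of every batch from a [batch, s, n]
// buffer into a [batch, n] buffer.
template<typename T>
void slice_first_token_ref(const T* in, T* out, int batch, int s, int n)
{
    for (int b = 0; b < batch; ++b)
        for (int c = 0; c < n; ++c)
            out[(size_t)b * n + c] = in[(size_t)b * s * n + c];
}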
// In vit.cc, after invokeGeneralLayerNorm, my function invokesplitout is called:
invokeGeneralLayerNorm(need_padding ? norm_out_buf : last_out_buf,
                       from_buf,
                       weights->post_transformer_layernorm_weights.gamma,
                       weights->post_transformer_layernorm_weights.beta,
                       h_token_num,
                       embed_dim_,
                       stream_);
int n = embed_dim_;
int s = seq_len;
int m = input_batch_size * s;
if (need_padding) {
    invokeRemovePadding(last_out_buf, norm_out_buf, padding_offset_, nopad_token_num_, head_num_ * head_dim_, stream_);
    invokesplitout(last_out_buf, output, m, n, s, stream_);
}
else {
    invokesplitout(last_out_buf, output, m, n, s, stream_);
}
// invokesplitout
// I have tried const half* __restrict__ in, half* __restrict__ out for fp16, but the result is still NaN. The fp32 path is correct.
template<typename T>
__global__ void splitout(const T* in, T* out, const int m, const int n, const int s)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        out[id] = in[tar_idx];
    }
}
template<>
__global__ void splitout(const half* in, half* out, const int m, const int n, const int s)
{
    half2*       out_ptr = (half2*)out;
    const half2* in_ptr  = (const half2*)in;
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        half2 d1 = in_ptr[tar_idx];
        out_ptr[id] = d1;
        // printf("value=%lu\n", d1);
    }
}
template<typename T>
void invokesplitout(const T* in, T* out, const int m, const int n, const int s, cudaStream_t stream)
{
    const int data_type_factor = 4 / sizeof(T);  // 1 for fp32, 2 for fp16
    dim3 block, grid;
    if (n / 4 / data_type_factor <= 1024) {
        block.x = n / 4 / data_type_factor;
        grid.x  = m;
    }
    else {
        block.x = 1024;
        grid.x  = (m * n + 1023) / 1024;
    }
    // splitout<<<grid, block, 0, stream>>>(in, out, m, n, s);
    splitout<<<grid, block, 0, stream>>>(in, out, m, n / data_type_factor, s);
}

template void invokesplitout(const float* in, float* out, const int m, const int n, const int s, cudaStream_t stream);
template void invokesplitout(const half* in, half* out, const int m, const int n, const int s, cudaStream_t stream);
What's your n?
n is embed_dim_ = 768, s is seq_len = 193.
I call ViT from TensorRT, so the following modifications are also required.
// vit.cc
FT_CHECK(output_tensors->at(0).shape.size() == 2);

// ViTPlugin.cpp
std::vector<Tensor> output_tensors = std::vector<Tensor>{
    Tensor{MEMORY_GPU,
           getTensorType<T>(),
           // std::vector<size_t>{(size_t)batch_size, (size_t)settings_.seq_len, (size_t)settings_.embed_dim},
           std::vector<size_t>{(size_t)batch_size, (size_t)settings_.embed_dim},
           (T*)(outputs[0])}};

// Input is B*in_chans*H*W, output should be B*seq_len*embed_dim*1
assert(outputIndex == 0);
DimsExprs output;
output.nbDims = 2;
output.d[0]   = inputs[0].d[0];
output.d[1]   = exprBuilder.constant(settings_.embed_dim);
// output.nbDims = 3;
// output.d[0] = inputs[0].d[0];
// output.d[1] = exprBuilder.constant(settings_.seq_len);
// output.d[2] = exprBuilder.constant(settings_.embed_dim);
Try to run without half2.
I modified it to the following form, but the result is still NaN.
template<>
__global__ void splitout(const half* in, half* out, const int m, const int n, const int s)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        out[id] = in[tar_idx];
        // printf("value=%lu\n", d1);
    }
}
For half, you need to set data_type_factor to be 1.
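The factor exists because the half2 specialization consumes two half values per iteration; with the scalar-half version above, n must not be halved. A minimal sketch of a launcher under that assumption (the name invokesplitout_scalar and the block size of 256 are illustrative choices, not code from the repo):

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Sketch: launch the scalar splitout with data_type_factor = 1, so the kernel
// sees the full n (e.g. 768) for both fp32 and fp16.
template<typename T>
void invokesplitout_scalar(const T* in, T* out, const int m, const int n, const int s, cudaStream_t stream)
{
    const int data_type_factor = 1;  // one element per loop iteration, no half2 vectorization
    dim3 block, grid;
    block.x = 256;
    grid.x  = (m * n + block.x - 1) / block.x;
    splitout<<<grid, block, 0, stream>>>(in, out, m, n / data_type_factor, s);
}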
OK, I set const int data_type_factor = 1, but it is still NaN.
You can print the values of inputs and outputs of your slice kernel under FP32 and FP16.
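One way to do that is a small debug kernel that converts half to float before printing; note that device printf cannot format half directly, so %lu on a half2 (as in the commented-out line above) prints garbage. A sketch with hypothetical names:

#include <cuda_fp16.h>
#include <cstdio>

// Hypothetical helper: print the first `count` values of a device half buffer,
// so the FP16 intermediates can be compared against the FP32 run.
__global__ void dump_half(const half* data, int count)
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < count; ++i) {
            printf("val[%d] = %f\n", i, __half2float(data[i]));
        }
    }
}

// Usage, e.g. right after invokesplitout:
// dump_half<<<1, 1, 0, stream_>>>(output, 8);
// cudaStreamSynchronize(stream_);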
Thanks, there was something wrong in my splitout; I have fixed it.
Closing this bug because it is inactive. Feel free to re-open this issue if you still have any problems.
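For readers who hit the same problem: in the kernel above the source offset is tar_idx = row_idx * n + col_idx, while in a [batch, s, n] buffer the first token of batch b starts at b * s * n, and the output only needs batch * n elements rather than m * n. A minimal sketch of a slice kernel written with that layout in mind (illustrative, not necessarily the fix the author applied):

// Sketch: out[b, c] = in[b, 0, c] for an input laid out as [batch, s, n].
template<typename T>
__global__ void slice_first_token(const T* in, T* out, const int batch, const int s, const int n)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < batch * n; id += blockDim.x * gridDim.x) {
        const int b = id / n;  // batch index
        const int c = id % n;  // feature index within embed_dim
        out[id] = in[(size_t)b * s * n + c];
    }
}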