FasterTransformer
In fp16 mode, slicing the result of ViT gives NaN
Hi, I want to use my ViT in FasterTransformer. At the end of the network I want to output only some of the features, not all of them, so I implemented a CUDA function. In fp32 the result and its dimensions match torch, but in fp16 the result is NaN. Could you help me find what is causing it?
// my implementation (pseudocode)
vit_output.size(batch, 193, 768)
my_output = vit_output[:, 0]
my_output.size(batch, 768)
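In flat row-major terms this means out[b * 768 + c] = in[b * 193 * 768 + c]. A minimal host-side reference one could compare the kernel output against (a sketch with illustrative names, not code from the repo):

#include <cstddef>

// Hypothetical CPU reference: copy token 0 of every batch from a [batch, s, n]
// buffer into a [batch, n] buffer.
template<typename T>
void slice_first_token_ref(const T* in, T* out, int batch, int s, int n)
{
    for (int b = 0; b < batch; ++b)
        for (int c = 0; c < n; ++c)
            out[(size_t)b * n + c] = in[(size_t)b * s * n + c];
}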
// In vit.cc, after invokeGeneralLayerNorm, my function invokesplitout is called:
invokeGeneralLayerNorm(need_padding ? norm_out_buf : last_out_buf,
                       from_buf,
                       weights->post_transformer_layernorm_weights.gamma,
                       weights->post_transformer_layernorm_weights.beta,
                       h_token_num,
                       embed_dim_,
                       stream_);
int n = embed_dim_;
int s = seq_len;
int m = input_batch_size * s;
if (need_padding) {
    invokeRemovePadding(last_out_buf, norm_out_buf, padding_offset_, nopad_token_num_, head_num_ * head_dim_, stream_);
    invokesplitout(last_out_buf, output, m, n, s, stream_);
}
else {
    invokesplitout(last_out_buf, output, m, n, s, stream_);
}
// invokesplitout
// I have tried const half* __restrict__ in, half* __restrict__ out for fp16, but the result is still NaN. The fp32 path is correct.
template<typename T>
__global__ void splitout(const T* in, T* out, const int m, const int n, const int s)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        out[id] = in[tar_idx];
    }
}
template<>
__global__ void splitout(const half* in, half* out, const int m, const int n, const int s)
{
    half2*       out_ptr = (half2*)out;
    const half2* in_ptr  = (const half2*)in;
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        half2 d1 = in_ptr[tar_idx];
        out_ptr[id] = d1;
        // printf("value=%lu\n", d1);
    }
}
template<typename T>
void invokesplitout(const T* in, T* out, const int m, const int n, const int s, cudaStream_t stream)
{
    const int data_type_factor = 4 / sizeof(T);  // 1 for fp32, 2 for fp16
    dim3 block, grid;
    if (n / 4 / data_type_factor <= 1024) {
        block.x = n / 4 / data_type_factor;
        grid.x  = m;
    }
    else {
        block.x = 1024;
        grid.x  = (m * n + 1023) / 1024;
    }
    // splitout<<<grid, block, 0, stream>>>(in, out, m, n, s);
    splitout<<<grid, block, 0, stream>>>(in, out, m, n / data_type_factor, s);
}

template void invokesplitout(const float* in, float* out, const int m, const int n, const int s, cudaStream_t stream);
template void invokesplitout(const half* in, half* out, const int m, const int n, const int s, cudaStream_t stream);
What's your n?
n is embed_dim_ = 768, s is seq_len = 193.
I call ViT from TensorRT, so the following modifications are also required.
// vit.cc
FT_CHECK(output_tensors->at(0).shape.size() == 2);

// ViTPlugin.cpp
std::vector<Tensor> output_tensors = std::vector<Tensor>{
    Tensor{MEMORY_GPU,
           getTensorType<T>(),
           // std::vector<size_t>{(size_t)batch_size, (size_t)settings_.seq_len, (size_t)settings_.embed_dim},
           std::vector<size_t>{(size_t)batch_size, (size_t)settings_.embed_dim},
           (T*)(outputs[0])}};

// Input is B*in_chans*H*W, output should be B*seq_len*embed_dim*1
assert(outputIndex == 0);
DimsExprs output;
output.nbDims = 2;
output.d[0]   = inputs[0].d[0];
output.d[1]   = exprBuilder.constant(settings_.embed_dim);
// output.nbDims = 3;
// output.d[0] = inputs[0].d[0];
// output.d[1] = exprBuilder.constant(settings_.seq_len);
// output.d[2] = exprBuilder.constant(settings_.embed_dim);
Try to run without half2.
I modified it to the following form, but the result is still NaN.
template<>
__global__ void splitout(const half* in, half* out, const int m, const int n, const int s)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        int col_idx     = id % n;
        int row_idx     = id / n / s;
        int tar_row_idx = row_idx * n;
        int tar_idx     = tar_row_idx + col_idx;
        out[id] = in[tar_idx];
        // printf("value=%lu\n", d1);
    }
}
For half, you need to set data_type_factor to be 1.
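The factor exists because the half2 specialization consumes two half values per iteration; with the scalar-half version above, n must not be halved. A minimal sketch of a launcher under that assumption (the name invokesplitout_scalar and the block size of 256 are illustrative choices, not code from the repo):

#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Sketch: launch the scalar splitout with data_type_factor = 1, so the kernel
// sees the full n (e.g. 768) for both fp32 and fp16.
template<typename T>
void invokesplitout_scalar(const T* in, T* out, const int m, const int n, const int s, cudaStream_t stream)
{
    const int data_type_factor = 1;  // one element per loop iteration, no half2 vectorization
    dim3 block, grid;
    block.x = 256;
    grid.x  = (m * n + block.x - 1) / block.x;
    splitout<<<grid, block, 0, stream>>>(in, out, m, n / data_type_factor, s);
}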
OK, I set const int data_type_factor = 1, but it is still NaN.
You can print the values of inputs and outputs of your slice kernel under FP32 and FP16.
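One way to do that is a small debug kernel that converts half to float before printing; note that device printf cannot format half directly, so %lu on a half2 (as in the commented-out line above) prints garbage. A sketch with hypothetical names:

#include <cuda_fp16.h>
#include <cstdio>

// Hypothetical helper: print the first `count` values of a device half buffer,
// so the FP16 intermediates can be compared against the FP32 run.
__global__ void dump_half(const half* data, int count)
{
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < count; ++i) {
            printf("val[%d] = %f\n", i, __half2float(data[i]));
        }
    }
}

// Usage, e.g. right after invokesplitout:
// dump_half<<<1, 1, 0, stream_>>>(output, 8);
// cudaStreamSynchronize(stream_);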
Thanks, there was something wrong in my splitout; I have fixed it.
Closing this bug because it is inactive. Feel free to re-open this issue if you still have any problems.
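For readers who hit the same problem: in the kernel above the source offset is tar_idx = row_idx * n + col_idx, while in a [batch, s, n] buffer the first token of batch b starts at b * s * n, and the output only needs batch * n elements rather than m * n. A minimal sketch of a slice kernel written with that layout in mind (illustrative, not necessarily the fix the author applied):

// Sketch: out[b, c] = in[b, 0, c] for an input laid out as [batch, s, n].
template<typename T>
__global__ void slice_first_token(const T* in, T* out, const int batch, const int s, const int n)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < batch * n; id += blockDim.x * gridDim.x) {
        const int b = id / n;  // batch index
        const int c = id % n;  // feature index within embed_dim
        out[id] = in[(size_t)b * s * n + c];
    }
}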