Is post_ln=1 supported for inference?
Is post_ln=1 supported for inference with a protobuf file? According to the code below, it seems post_ln=1 is not correctly supported: the input is still pre-normalized.
template <typename T>
__global__ void ker_norm_layer_resual(T* input, T* output, const T* scale,
                                      const T* bias, const T* residual_bias,
                                      const int hidden_size, bool is_post_ln) {
  uint block_start = blockIdx.x * hidden_size;
  uint start = block_start + threadIdx.x;
  uint end = block_start + hidden_size;
  float val = 0.0;
  for (uint i = start; i < end; i += blockDim.x) {
    val += input[i];
  }

  // step 0. compute mean
  __shared__ float s_mean;
  float reduce_res = blockReduceSum<float>(val);
  if (threadIdx.x == 0) s_mean = reduce_res / float(hidden_size);
  __syncthreads();

  // step 1. compute variance
  val = 0.0;
  for (uint i = start; i < end; i += blockDim.x) {
    float tmp = input[i] - s_mean;
    val += tmp * tmp;
  }
  __shared__ float s_var;
  reduce_res = blockReduceSum(val);
  if (threadIdx.x == 0)
    s_var = rsqrtf(reduce_res / float(hidden_size) + epsilon);
  __syncthreads();

  // step 2. layer norm
  for (uint i = start; i < end; i += blockDim.x) {
    val = input[i] - s_mean;
    // Though post_ln=1, the input is pre-normalized and is assigned to output?
    output[i] = val * s_var * __ldg(&scale[i - block_start]) +
                __ldg(&bias[i - block_start]);
    if (is_post_ln) {
      input[i] = output[i] + __ldg(&residual_bias[i - block_start]);
    } else {
      input[i] += __ldg(&residual_bias[i - block_start]);
    }
  }
}
Post-norm is supported with post_ln=1. The comment "Though post_ln=1 ..." in the code above is out of date.
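To make the behavior concrete, below is a minimal host-side sketch of the per-row arithmetic the kernel performs (my own paraphrase, not LightSeq code; the function and variable names are illustrative). output always receives the layer-normalized value; what differs between the two modes is the residual stream written back into input: with is_post_ln the next layer consumes the normalized value plus the residual bias (post-LN), otherwise it consumes the raw, un-normalized input plus the residual bias (pre-LN).

#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical reference for one "row" of hidden_size elements,
// mirroring the logic of ker_norm_layer_resual above.
void norm_layer_residual_ref(std::vector<float>& input, std::vector<float>& output,
                             const std::vector<float>& scale, const std::vector<float>& bias,
                             const std::vector<float>& residual_bias, bool is_post_ln,
                             float epsilon = 1e-6f) {
  const int n = static_cast<int>(input.size());
  float mean = 0.f, var = 0.f;
  for (float v : input) mean += v;
  mean /= n;
  for (float v : input) var += (v - mean) * (v - mean);
  const float rstd = 1.f / std::sqrt(var / n + epsilon);
  for (int i = 0; i < n; ++i) {
    // output always gets the layer-normalized value.
    output[i] = (input[i] - mean) * rstd * scale[i] + bias[i];
    // The residual stream written back into input differs by mode:
    //   post-LN: next layer sees the normalized value (+ residual bias)
    //   pre-LN : next layer sees the raw input (+ residual bias)
    if (is_post_ln)
      input[i] = output[i] + residual_bias[i];
    else
      input[i] += residual_bias[i];
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, out(4);
  std::vector<float> scale(4, 1.f), bias(4, 0.f), res_bias(4, 0.f);
  norm_layer_residual_ref(x, out, scale, bias, res_bias, /*is_post_ln=*/true);
  // With is_post_ln=true the residual stream x is now the normalized output.
  for (float v : x) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}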
Supported. You can test the BERT example, which uses post-norm.