[SYCL] Add __imf_rcp64h to intel math libdevice
Some deep learning frameworks use '__nv_rcp64h' in their CUDA backends, so we need to provide equivalent functionality in the DPC++ compiler.
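Roughly, '__nv_rcp64h' computes 1/x and keeps only the upper 32 bits of the result's bit pattern. A minimal host-side sketch of that reading (reference only, not the libdevice implementation; FTZ/DAZ and NaN handling are left out here):

#include <cstdint>
#include <cstring>

// Reference sketch: reciprocal with the lower 32 bits of the IEEE-754 bit
// pattern cleared, i.e. keep the sign, exponent and top 20 mantissa bits.
static double rcp64h_reference(double a) {
  double r = 1.0 / a;
  uint64_t bits;
  std::memcpy(&bits, &r, sizeof(bits));
  bits &= 0xFFFF'FFFF'0000'0000ULL;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}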
Hi, @zettai-reido This PR adds the counterpart of '__nv_rcp64h' to the imf libdevice; could you help review it? Thanks very much.
Hi, @tfzhu Could you help check whether the pre-commit CI failure in Jenkins/Precommit is an infrastructure issue? Thanks very much.
Yes, @DoyleLi is working on the infra issue.
Hi @jinge90. The issue is resolved.
Hello @jinge90, I ran the following test program
#include <cstdio>
#include <cstdint>
#include <cstring> // for memcpy
#include <initializer_list>
#include <vector>
extern "C"
__device__
double __nv_rcp64h(double a);
__device__
void calculate(const double* a, double* y) {
y[0] = __nv_rcp64h(a[0]);
}
__global__
void test_kernel(size_t n, const double* vec_a, double* vec_y) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= n) { return; } // tail
calculate(vec_a + i, vec_y + i);
}
int main() {
std::vector<double> a = { 2, 3, 5, 7, 11, 13, 17, 19 };
std::initializer_list<uint64_t> specials = {
0x7FF0'0000'0000'0000, // +Inf
0xFFF0'0000'0000'0000, // -Inf
0x7FF8'0000'0000'0000, // +qNaN
0xFFF8'0000'0000'0000, // -qNaN
0x7FEF'FFFF'FFFF'FFFF, // Huge
0xFFEF'FFFF'FFFF'FFFF, // -Huge
0x0010'0000'0000'0000, // smallest Normal
0x001E'EEEE'EEEE'EEEE, // small Normal
0x000F'FFFF'FFFF'FFFF, // largest denormal
0x0000'0000'0000'0001, // smallest denormal
0x8010'0000'0000'0000, // -smallest Normal
0x800F'FFFF'FFFF'FFFF, // -largest denormal
0x8000'0000'0000'0001, // -smallest denormal
0x3FF0'0000'0000'0000, // +1
0xBFF0'0000'0000'0000, // -1
0x3FF8'0000'0000'0000, // +1.5
0xBFF8'0000'0000'0000, // -1.5
};
for (auto e: specials) {
double v;
memcpy(&v, &e, sizeof(v));
a.push_back(v);
}
std::vector<double> y(a.size(), 888);
void* ptr_a;
void* ptr_y;
cudaMalloc(&ptr_a, a.size() * sizeof(double));
cudaMalloc(&ptr_y, y.size() * sizeof(double));
cudaMemcpy(ptr_a, a.data(), a.size() * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(ptr_y, y.data(), y.size() * sizeof(double), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
int bs = 256;
int nb = (a.size() + bs - 1) / bs;
test_kernel<<<nb, bs>>>(a.size(), (const double*)ptr_a, (double*)ptr_y);
cudaDeviceSynchronize();
cudaMemcpy(y.data(), ptr_y, y.size() * sizeof(double), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (int i = 0; i < a.size(); ++i) {
fprintf(stderr, "%24.13la => %24.13la\n", (double)a[i], (double)y[i]);
}
cudaFree(ptr_y);
cudaFree(ptr_a);
return 0;
}
and obtained the following results:
0x1.0000000000000p+1 => 0x1.0000000000000p-1
0x1.8000000000000p+1 => 0x1.5555500000000p-2
0x1.4000000000000p+2 => 0x1.9999900000000p-3
0x1.c000000000000p+2 => 0x1.2492400000000p-3
0x1.6000000000000p+3 => 0x1.745d100000000p-4
0x1.a000000000000p+3 => 0x1.3b13b00000000p-4
0x1.1000000000000p+4 => 0x1.e1e1e00000000p-5
0x1.3000000000000p+4 => 0x1.af28600000000p-5
inf => 0x0.0000000000000p+0
-inf => -0x0.0000000000000p+0
nan => nan
-nan => nan
0x1.fffffffffffffp+1023 => 0x0.0000000000000p+0
-0x1.fffffffffffffp+1023 => -0x0.0000000000000p+0
0x1.0000000000000p-1022 => 0x1.0000000000000p+1022
0x1.eeeeeeeeeeeeep-1022 => 0x1.08d3e00000000p+1021
0x0.fffffffffffffp-1022 => inf
0x0.0000000000001p-1022 => inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 => -inf
-0x0.0000000000001p-1022 => -inf
0x1.0000000000000p+0 => 0x1.0000000000000p+0
-0x1.0000000000000p+0 => -0x1.0000000000000p+0
0x1.8000000000000p+0 => 0x1.5555500000000p-1
-0x1.8000000000000p+0 => -0x1.5555500000000p-1
Could you check your implementation against it?
Hi, @zettai-reido I tried it and got the following results:
0x1.0000000000000p+1 => 0x1.0000000000000p-1
0x1.8000000000000p+1 => 0x1.5555500000000p-2
0x1.4000000000000p+2 => 0x1.9999900000000p-3
0x1.c000000000000p+2 => 0x1.2492400000000p-3
0x1.6000000000000p+3 => 0x1.745d100000000p-4
0x1.a000000000000p+3 => 0x1.3b13b00000000p-4
0x1.1000000000000p+4 => 0x1.e1e1e00000000p-5
0x1.3000000000000p+4 => 0x1.af28600000000p-5
inf => 0x0.0000000000000p+0
-inf => -0x0.0000000000000p+0
nan => nan
-nan => **-nan**
0x1.fffffffffffffp+1023 => **0x0.4000000000000p-1022**
-0x1.fffffffffffffp+1023 => **-0x0.4000000000000p-1022**
0x1.0000000000000p-1022 => 0x1.0000000000000p+1022
0x1.eeeeeeeeeeeeep-1022 => 0x1.08d3d00000000p+1021
0x0.fffffffffffffp-1022 => **0x1.0000000000000p+1022**
0x0.0000000000001p-1022 => inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 => **-0x1.0000000000000p+1022**
-0x0.0000000000001p-1022 => -inf
0x1.0000000000000p+0 => 0x1.0000000000000p+0
-0x1.0000000000000p+0 => -0x1.0000000000000p+0
0x1.8000000000000p+0 => 0x1.5555500000000p-1
-0x1.8000000000000p+0 => -0x1.5555500000000p-1
I double-checked the mismatched cases and printed the corresponding full-precision reciprocals; my local results match the upper 32 bits of those reciprocals (see the check below for the 'Huge' case). Do you have any idea why CUDA's result is different?
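For the 'Huge' input, for instance, the exact reciprocal is a denormal, so keeping its upper 32 bits gives 0x0.4000000000000p-1022, while CUDA apparently flushes it to zero. A small host-side check (illustrative only; assumes the host itself does not flush denormals):

#include <cstdio>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t huge_bits = 0x7FEF'FFFF'FFFF'FFFFULL; // the 'Huge' input from the test
  double huge;
  std::memcpy(&huge, &huge_bits, sizeof(huge));
  double r = 1.0 / huge;                         // exact reciprocal is a denormal
  uint64_t bits;
  std::memcpy(&bits, &r, sizeof(bits));
  bits &= 0xFFFF'FFFF'0000'0000ULL;              // keep only the upper 32 bits
  std::memcpy(&r, &bits, sizeof(r));
  std::printf("%24.13la\n", r);                  // expect 0x0.4000000000000p-1022
  return 0;
}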
@jinge90 I believe it flushes denormals to zero in source and destination and utilizes a slightly different table. 1-bit differences are unavoidable:
0x1.eeeeeeeeeeeeep-1022 => 0x1.08d3e00000000p+1021
vs
0x1.eeeeeeeeeeeeep-1022 => 0x1.08d3d00000000p+1021
Hi, Andrey. What do you mean by "utilizes a different table"?
@jinge90 rcp64h provides an initial value for a division algorithm to work with. Such algorithms are sometimes implemented as a table lookup.
0x1.08d3e00000000p-1

(real math = infinite precision => machine result)
1.0 / 5.0 = 1.999999999999...p-3 => 0x1.9999900000000p-3
1.0 / 7.0 = 1.249249249249...p-3 => 0x1.2492400000000p-3
2251799813685248 / 4353479639791479 = 1.08D3D.......CB08D3DD306 => 1.08D3Ep-1
CUDA rounded up the last result but left the first and second as-is (truncated).
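To illustrate the "initial value for a division algorithm" point above: a seed y0 ≈ 1/a with roughly 32 correct bits can be refined with Newton-Raphson steps, each of which about doubles the number of correct bits. A hedged sketch (not NVIDIA's actual implementation):

// Illustrative only: refine an rcp64h-style seed y0 ~ 1/a using
// Newton-Raphson, y_{n+1} = y_n * (2 - a * y_n).
static double refine_reciprocal(double a, double y0) {
  double y = y0;
  y = y * (2.0 - a * y); // ~32 correct bits become ~53 (full double precision)
  y = y * (2.0 - a * y); // guard step to absorb rounding error
  return y;
}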
Hi, @zettai-reido I also observed that CUDA's rcp64h does not apply the same rounding rule to all values, so it seems we can't guarantee exactly the same behavior as CUDA; a 1-bit difference is unavoidable. Do you think we can merge the current patch, or do you have any idea how we could improve the current implementation? Thanks very much.
I suggest evening things out by adopting FTZ/DAZ behavior. I wrote this quickly to test the new behavior:
#include <cstdint>
#include <cstring>

// Quick host-side model of rcp64h with FTZ/DAZ: denormal inputs are treated
// as zero, denormal results are flushed to zero, and NaN inputs come back as
// a positive NaN.
void emulate(const double* a, double* y) {
  uint64_t xa = 0;
  uint64_t xy = 0;
  double sa = a[0];
  memcpy(&xa, &sa, sizeof(xa));
  int ea = (xa >> 52) & 0x7FF;
  if (0 == ea) { sa = 0.0; } // DAZ: denormal input behaves like zero
  else if (0x7FF == ea && (xa & 0x000F'FFFF'FFFF'FFFF)) {
    xy = xa & 0x7FFF'FFFF'FFFF'FFFF; // NaN input: clear the sign bit and return
    memcpy(&y[0], &xy, sizeof(y[0]));
    return;
  }
  double sy = 1.0 / sa;
  memcpy(&xy, &sy, sizeof(xy));
  uint64_t xy_high = (xy >> 32);                  // keep only the upper 32 bits
  if (((xy >> 52) & 0x7FF) == 0) { xy_high = 0; } // FTZ: flush a denormal result
  xy = (xy_high << 32);
  xy |= (xa & 0x8000'0000'0000'0000);             // reattach the input's sign (matters for the DAZ path)
  memcpy(&sy, &xy, sizeof(sy));
  y[0] = sy;
}
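A possible way to exercise it on a few of the special values from the CUDA test above (illustrative only; assumes the includes and the emulate() definition above):

#include <cstdio>

int main() {
  const uint64_t specials[] = {
    0x7FEF'FFFF'FFFF'FFFF, // Huge             -> expect 0 (result flushed by FTZ)
    0x000F'FFFF'FFFF'FFFF, // largest denormal -> expect inf (input zeroed by DAZ)
    0x3FF8'0000'0000'0000, // +1.5             -> expect 0x1.5555500000000p-1
  };
  for (uint64_t bits : specials) {
    double a = 0.0, y = 0.0;
    std::memcpy(&a, &bits, sizeof(a));
    emulate(&a, &y);
    std::fprintf(stderr, "%24.13la => %24.13la\n", a, y);
  }
  return 0;
}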
Hi, @zettai-reido I updated the patch to apply FTZ/DAZ in the implementation, and the results are now:
0x1.0000000000000p+1 => 0x1.0000000000000p-1
0x1.8000000000000p+1 => 0x1.5555500000000p-2
0x1.4000000000000p+2 => 0x1.9999900000000p-3
0x1.c000000000000p+2 => 0x1.2492400000000p-3
0x1.6000000000000p+3 => 0x1.745d100000000p-4
0x1.a000000000000p+3 => 0x1.3b13b00000000p-4
0x1.1000000000000p+4 => 0x1.e1e1e00000000p-5
0x1.3000000000000p+4 => 0x1.af28600000000p-5
inf => 0x0.0000000000000p+0
-inf => -0x0.0000000000000p+0
nan => nan
-nan => nan
0x1.fffffffffffffp+1023 => 0x0.0000000000000p+0
-0x1.fffffffffffffp+1023 => -0x0.0000000000000p+0
0x1.0000000000000p-1022 => 0x1.0000000000000p+1022
0x1.eeeeeeeeeeeeep-1022 => 0x1.08d3d00000000p+1021
0x0.fffffffffffffp-1022 => inf
0x0.0000000000001p-1022 => inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 => -inf
-0x0.0000000000001p-1022 => -inf
0x1.0000000000000p+0 => 0x1.0000000000000p+0
-0x1.0000000000000p+0 => -0x1.0000000000000p+0
0x1.8000000000000p+0 => 0x1.5555500000000p-1
-0x1.8000000000000p+0 => -0x1.5555500000000p-1
Hi, @intel/dpcpp-tools-reviewers, @intel/llvm-reviewers-runtime and @aelovikov-intel, could you help review this patch? Thanks very much.
Hi, @intel/dpcpp-tools-reviewers Could you help review this patch? Thanks very much.
Hi, @intel/dpcpp-tools-reviewers Kind ping~. Thanks very much.