[SYCL] Add __imf_rcp64h to intel math libdevice

Open jinge90 opened this issue 1 year ago • 14 comments

Some deep learning frameworks use '__nv_rcp64h' in their CUDA backends. We need to provide equivalent functionality in the DPC++ compiler.
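
For context, a minimal sketch of how the new entry point could be used from SYCL device code, assuming __imf_rcp64h mirrors the double(double) signature of __nv_rcp64h and is resolved from the imf libdevice at link time (the declaration and test below are illustrative, not part of the patch):

#include <sycl/sycl.hpp>
#include <cstdio>

// Assumed declaration; the actual header/name resolution in the patch may differ.
extern "C" SYCL_EXTERNAL double __imf_rcp64h(double);

int main() {
    sycl::queue q;
    double *y = sycl::malloc_shared<double>(1, q);
    q.single_task([=] {
        // Approximate reciprocal of 3.0; the low 32 bits of the result are zeroed.
        y[0] = __imf_rcp64h(3.0);
    }).wait();
    std::printf("%a\n", y[0]);  // expected to be close to 0x1.5555500000000p-2
    sycl::free(y, q);
    return 0;
}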

jinge90 avatar Oct 20 '23 02:10 jinge90

Hi, @zettai-reido This PR adds the equivalent of '__nv_rcp64h' to imf libdevice, could you help review it? Thanks very much.

jinge90 avatar Feb 19 '24 02:02 jinge90

Hi, @tfzhu Could you help check whether the pre-ci failure in Jenkins/Precommit is an infrastructure issue? Thanks very much.

jinge90 avatar Feb 19 '24 06:02 jinge90

Yes, @DoyleLi is working on the infra issue.

tfzhu avatar Feb 19 '24 06:02 tfzhu

Hi @jinge90 . The issue is resolved.

DoyleLi avatar Feb 19 '24 06:02 DoyleLi

Hello @jinge90

#include <cstdio>
#include <cstdint>
#include <cstring>   // memcpy
#include <initializer_list>
#include <vector>

extern "C"
__device__
double __nv_rcp64h(double a);

__device__
void calculate(const double* a, double* y) {
    y[0] = __nv_rcp64h(a[0]);
}

__global__
void test_kernel(size_t n, const double* vec_a, double* vec_y) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) { return; } // tail
    calculate(vec_a + i, vec_y + i);
}

int main() {
    std::vector<double> a = { 2, 3, 5, 7, 11, 13, 17, 19 };
    std::initializer_list<uint64_t> specials = {
        0x7FF0'0000'0000'0000, // +Inf
        0xFFF0'0000'0000'0000, // -Inf
        0x7FF8'0000'0000'0000, // sNaN
        0xFFF8'0000'0000'0000, // qNaN

        0x7FEF'FFFF'FFFF'FFFF, // Huge
        0xFFEF'FFFF'FFFF'FFFF, // -Huge

        0x0010'0000'0000'0000, // smallest Normal
        0x001E'EEEE'EEEE'EEEE, // small Normal
        0x000F'FFFF'FFFF'FFFF, // largest denormal
        0x0000'0000'0000'0001, // smallest denormal

        0x8010'0000'0000'0000, // smallest Normal
        0x800F'FFFF'FFFF'FFFF, // largest denormal
        0x8000'0000'0000'0001, // smallest denormal

        0x3FF0'0000'0000'0000, // +1
        0xBFF0'0000'0000'0000, // -1
        0x3FF8'0000'0000'0000, // +1.5
        0xBFF8'0000'0000'0000, // -1.5
    };

    for (auto e: specials) {
        double v;
        memcpy(&v, &e, sizeof(v));
        a.push_back(v);
    }

    std::vector<double> y(a.size(), 888);

    void* ptr_a;
    void* ptr_y;

    cudaMalloc(&ptr_a, a.size() * sizeof(double));
    cudaMalloc(&ptr_y, y.size() * sizeof(double));

    cudaMemcpy(ptr_a, a.data(), a.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(ptr_y, y.data(), y.size() * sizeof(double), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    int bs = 256;
    int nb = (a.size() + bs - 1) / bs;

    test_kernel<<<nb, bs>>>(a.size(), (const double*)ptr_a, (double*)ptr_y);
    cudaDeviceSynchronize();
    cudaMemcpy(y.data(), ptr_y, y.size() * sizeof(double), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    for (int i = 0; i < a.size(); ++i) {
        fprintf(stderr, "%24.13la => %24.13la\n", (double)a[i], (double)y[i]);
    }

    cudaFree(ptr_y);
    cudaFree(ptr_a);
    return 0;
}

and obtained the following results:

    0x1.0000000000000p+1 =>     0x1.0000000000000p-1
    0x1.8000000000000p+1 =>     0x1.5555500000000p-2
    0x1.4000000000000p+2 =>     0x1.9999900000000p-3
    0x1.c000000000000p+2 =>     0x1.2492400000000p-3
    0x1.6000000000000p+3 =>     0x1.745d100000000p-4
    0x1.a000000000000p+3 =>     0x1.3b13b00000000p-4
    0x1.1000000000000p+4 =>     0x1.e1e1e00000000p-5
    0x1.3000000000000p+4 =>     0x1.af28600000000p-5
                     inf =>     0x0.0000000000000p+0
                    -inf =>    -0x0.0000000000000p+0
                     nan =>                      nan
                    -nan =>                      nan
 0x1.fffffffffffffp+1023 =>     0x0.0000000000000p+0
-0x1.fffffffffffffp+1023 =>    -0x0.0000000000000p+0
 0x1.0000000000000p-1022 =>  0x1.0000000000000p+1022
 0x1.eeeeeeeeeeeeep-1022 =>  0x1.08d3e00000000p+1021
 0x0.fffffffffffffp-1022 =>                      inf
 0x0.0000000000001p-1022 =>                      inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 =>                     -inf
-0x0.0000000000001p-1022 =>                     -inf
    0x1.0000000000000p+0 =>     0x1.0000000000000p+0
   -0x1.0000000000000p+0 =>    -0x1.0000000000000p+0
    0x1.8000000000000p+0 =>     0x1.5555500000000p-1
   -0x1.8000000000000p+0 =>    -0x1.5555500000000p-1

Could you check your implementation against it?

zettai-reido avatar Feb 19 '24 08:02 zettai-reido

Hi, @zettai-reido I tried it and got the following results:

    0x1.0000000000000p+1 =>     0x1.0000000000000p-1
    0x1.8000000000000p+1 =>     0x1.5555500000000p-2
    0x1.4000000000000p+2 =>     0x1.9999900000000p-3
    0x1.c000000000000p+2 =>     0x1.2492400000000p-3
    0x1.6000000000000p+3 =>     0x1.745d100000000p-4
    0x1.a000000000000p+3 =>     0x1.3b13b00000000p-4
    0x1.1000000000000p+4 =>     0x1.e1e1e00000000p-5
    0x1.3000000000000p+4 =>     0x1.af28600000000p-5
                     inf =>     0x0.0000000000000p+0
                    -inf =>    -0x0.0000000000000p+0
                     nan =>                      nan
                    -nan =>                     **-nan**
 0x1.fffffffffffffp+1023 =>  **0x0.4000000000000p-1022**
-0x1.fffffffffffffp+1023 => **-0x0.4000000000000p-1022**
 0x1.0000000000000p-1022 =>  0x1.0000000000000p+1022
 0x1.eeeeeeeeeeeeep-1022 =>  0x1.08d3d00000000p+1021
 0x0.fffffffffffffp-1022 =>  **0x1.0000000000000p+1022**
 0x0.0000000000001p-1022 =>                      inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 => **-0x1.0000000000000p+1022**
-0x0.0000000000001p-1022 =>                     -inf
    0x1.0000000000000p+0 =>     0x1.0000000000000p+0
   -0x1.0000000000000p+0 =>    -0x1.0000000000000p+0
    0x1.8000000000000p+0 =>     0x1.5555500000000p-1
   -0x1.8000000000000p+0 =>    -0x1.5555500000000p-1

I double-checked the mismatched cases and printed the corresponding reciprocal values; my local results align with the 'upper 32 bits' of the corresponding reciprocals. Do you have any idea why CUDA's results are different?
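
For illustration, here is a minimal host-side sketch of the 'upper 32 bits' observation (a hypothetical helper, not part of the patch): it computes 1.0/a in double precision and zeroes the low 32 bits of the bit pattern, which reproduces values such as 0x1.9999900000000p-3 for a = 5.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Keep only the upper 32 bits of the reciprocal's bit pattern.
static double upper32_of_reciprocal(double a) {
    double r = 1.0 / a;
    uint64_t bits;
    memcpy(&bits, &r, sizeof(bits));
    bits &= 0xFFFF'FFFF'0000'0000;   // drop the low 32 mantissa bits
    memcpy(&r, &bits, sizeof(r));
    return r;
}

int main() {
    for (double a : {2.0, 3.0, 5.0, 7.0}) {
        printf("%g => %a\n", a, upper32_of_reciprocal(a));
    }
    return 0;
}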

jinge90 avatar Feb 19 '24 09:02 jinge90

@jinge90 I believe it flushes denormals to zero in the source and destination and utilizes a slightly different table; 1-bit differences are unavoidable:

 0x1.eeeeeeeeeeeeep-1022 =>  0x1.08d3e00000000p+1021
vs
 0x1.eeeeeeeeeeeeep-1022 =>  0x1.08d3d00000000p+1021

zettai-reido avatar Feb 19 '24 10:02 zettai-reido

Hi, Andrey. What do you mean by "utilizes a different table"?

jinge90 avatar Feb 19 '24 14:02 jinge90

@jinge90 rcp64h provides the initial value for a division algorithm to work. Sometimes such algorithms are implemented as a table lookup.

 real math                            infinite precision              machine result
 1.0 / 5.0                          = 0x1.999999999999...p-3       => 0x1.9999900000000p-3
 1.0 / 7.0                          = 0x1.249249249249...p-3       => 0x1.2492400000000p-3
 2251799813685248/4353479639791479  = 0x1.08D3D...CB08D3DD306p-1   => 0x1.08D3Ep-1

It rounded up the last result (to 0x1.08d3e00000000p-1) but left the first and second as-is.
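
To make the "initial value for a division algorithm" point concrete, here is a minimal host-side sketch (illustrative only, not CUDA's actual implementation) of refining an rcp64h-style seed with Newton-Raphson iterations, x_{n+1} = x_n * (2 - a * x_n):

#include <cstdint>
#include <cstdio>
#include <cstring>

// rcp64h-style seed: reciprocal with the low 32 bits of the bit pattern cleared
// (a simplified model of the real instruction, which may also round the kept bits).
static double seed_rcp(double a) {
    double r = 1.0 / a;
    uint64_t bits;
    memcpy(&bits, &r, sizeof(bits));
    bits &= 0xFFFF'FFFF'0000'0000;
    memcpy(&r, &bits, sizeof(r));
    return r;
}

int main() {
    double a = 7.0;
    double x = seed_rcp(a);            // roughly 20 correct mantissa bits
    for (int i = 0; i < 2; ++i) {
        x = x * (2.0 - a * x);         // Newton-Raphson: the error roughly squares each step
    }
    printf("seed = %a, refined = %a, 1/a = %a\n", seed_rcp(a), x, 1.0 / a);
    return 0;
}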

zettai-reido avatar Feb 20 '24 01:02 zettai-reido

Hi, @zettai-reido I also observed that not all values use the same rounding rule for rcp64h on the CUDA platform, so it seems we can't guarantee exactly the same behavior as CUDA; a 1-bit difference is unavoidable. Do you think we can merge the current patch? Or do you have any idea how we can improve/enhance the current implementation? Thanks very much.

jinge90 avatar Feb 20 '24 01:02 jinge90

I suggest evening it out with FTZ/DAZ behavior. I wrote this quickly to test the new behavior.

#include <cstdint>
#include <cstring>

// Reference emulation of rcp64h with FTZ/DAZ semantics: denormal inputs are
// treated as zero, denormal results are flushed to zero, and only the upper
// 32 bits of the reciprocal's bit pattern are kept.
void emulate(const double* a, double* y) {
    uint64_t xa = 0;
    uint64_t xy = 0;

    double sa = a[0];
    memcpy(&xa, &sa, sizeof(xa));

    int ea = (xa >> 52) & 0x7FF;
    if (0 == ea) { sa = 0.0; }                              // DAZ: denormal input -> zero
    else if (0x7FF == ea && (xa & 0x000F'FFFF'FFFF'FFFF)) { // NaN input -> return NaN with the sign cleared
        xy = xa & 0x7FFF'FFFF'FFFF'FFFF;
        memcpy(&y[0], &xy, sizeof(y[0]));
        return;
    }

    double sy = 1.0 / sa;
    memcpy(&xy, &sy, sizeof(xy));
    uint64_t xy_high = (xy >> 32);

    if (((xy >> 52) & 0x7FF) == 0) { xy_high = 0; }         // FTZ: denormal result -> zero
    xy = (xy_high << 32);                                   // keep only the upper 32 bits

    xy |= (xa & 0x8000'0000'0000'0000);                     // reapply the sign of the input
    memcpy(&sy, &xy, sizeof(sy));
    y[0] = sy;
}
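
A quick host-side driver for the emulation above (illustrative; it assumes the emulate() definition from the previous block is in the same translation unit or linked in):

#include <cstdio>

void emulate(const double* a, double* y);  // defined above

int main() {
    // 3.0 should map to 0x1.5555500000000p-2, matching the table earlier in the thread;
    // the smallest denormal should be treated as zero and therefore produce inf.
    double inputs[] = { 3.0, 7.0, 1e300, 5e-324, -2.0 };
    for (double a : inputs) {
        double y = 0.0;
        emulate(&a, &y);
        std::printf("%24.13la => %24.13la\n", a, y);
    }
    return 0;
}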

zettai-reido avatar Feb 20 '24 05:02 zettai-reido

Hi, @zettai-reido I updated the patch to apply FTZ/DAZ in the implementation, and the results are:

    0x1.0000000000000p+1 =>     0x1.0000000000000p-1
    0x1.8000000000000p+1 =>     0x1.5555500000000p-2
    0x1.4000000000000p+2 =>     0x1.9999900000000p-3
    0x1.c000000000000p+2 =>     0x1.2492400000000p-3
    0x1.6000000000000p+3 =>     0x1.745d100000000p-4
    0x1.a000000000000p+3 =>     0x1.3b13b00000000p-4
    0x1.1000000000000p+4 =>     0x1.e1e1e00000000p-5
    0x1.3000000000000p+4 =>     0x1.af28600000000p-5
                     inf =>     0x0.0000000000000p+0
                    -inf =>    -0x0.0000000000000p+0
                     nan =>                      nan
                    -nan =>                      nan
 0x1.fffffffffffffp+1023 =>     0x0.0000000000000p+0
-0x1.fffffffffffffp+1023 =>    -0x0.0000000000000p+0
 0x1.0000000000000p-1022 =>  0x1.0000000000000p+1022
 0x1.eeeeeeeeeeeeep-1022 =>  0x1.08d3d00000000p+1021
 0x0.fffffffffffffp-1022 =>                      inf
 0x0.0000000000001p-1022 =>                      inf
-0x1.0000000000000p-1022 => -0x1.0000000000000p+1022
-0x0.fffffffffffffp-1022 =>                     -inf
-0x0.0000000000001p-1022 =>                     -inf
    0x1.0000000000000p+0 =>     0x1.0000000000000p+0
   -0x1.0000000000000p+0 =>    -0x1.0000000000000p+0
    0x1.8000000000000p+0 =>     0x1.5555500000000p-1
   -0x1.8000000000000p+0 =>    -0x1.5555500000000p-1

jinge90 avatar Feb 20 '24 06:02 jinge90

Hi, @intel/dpcpp-tools-reviewers, @intel/llvm-reviewers-runtime, and @aelovikov-intel, could you help review this patch? Thanks very much.

jinge90 avatar Feb 20 '24 07:02 jinge90

Hi, @intel/dpcpp-tools-reviewers Could you help review this patch? Thanks very much.

jinge90 avatar Feb 21 '24 02:02 jinge90

Hi, @intel/dpcpp-tools-reviewers Kind ping~. Thanks very much.

jinge90 avatar Feb 23 '24 06:02 jinge90

Hi, @intel/dpcpp-tools-reviewers Kind ping~. Thanks very much.

jinge90 avatar Feb 26 '24 13:02 jinge90