
hip inline assembly

ahatstat opened this issue 3 years ago · 9 comments

What is the Hip-Clang equivalent of this CUDA function?

__device__ __forceinline__ uint32_t add_cc(uint32_t a, uint32_t b)
{
   uint32_t r;
   asm volatile ("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
   return r;
}

I'm porting a CUDA project that contains inline PTX assembly to HIP-Clang. The function is used to implement multi-precision addition on NVIDIA GPUs. I tried:

asm volatile ("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b)); //invalid instruction
asm volatile ("V_ADD_CO_U32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b)); //invalid operand for instruction
asm volatile ("V_ADD_CO_U32 %0, %1, %2;" : "=v"(r) : "v"(a), "v"(b)); //operands are not valid for this GPU or mode

The target hardware is an RX 6800, with AMD clang version 14.0.0.
Is the RDNA2 ISA the correct instruction set reference?
Is the LLVM User Guide for AMDGPU Backend an applicable reference? This question is also posted on Stack Overflow.
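For reference, add_cc is the first link of a carry chain; the higher limbs use an addc counterpart so the carry flag set by add.cc.u32 is consumed by the next limb. A sketch of that pattern (the addc_cc helper and the limb layout here are illustrative, not necessarily the project's exact code):

__device__ __forceinline__ uint32_t addc_cc(uint32_t a, uint32_t b)
{
    uint32_t r;
    // addc.cc.u32 consumes the carry-in set by the previous add.cc.u32 and sets the carry-out
    asm volatile ("addc.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(a), "r"(b));
    return r;
}

// 128-bit addition as a chain over four 32-bit limbs:
// c[0] = add_cc(a[0], b[0]);
// c[1] = addc_cc(a[1], b[1]);
// c[2] = addc_cc(a[2], b[2]);
// c[3] = addc_cc(a[3], b[3]);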

ahatstat avatar Mar 22 '22 13:03 ahatstat

You might want to check https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/10_inline_asm ; in my opinion it explains things a bit better. As for your issue, maybe you can look at the generated assembly for 64-bit integer additions to see what instructions the compiler emits; any array accessed with an offset should show the pattern. Maybe you need to indicate vcc as an additional operand?
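For example, a minimal kernel like this one (the name add_u64 is only illustrative) could be compiled and its ISA inspected to see how the backend chains the carry, i.e. a v_add_co_u32 followed by a carry-in add (v_addc_co_u32 or v_add_co_ci_u32, depending on the architecture):

#include <hip/hip_runtime.h>
#include <cstdint>

// The generated ISA for this kernel shows the carry chain the compiler
// emits for a plain 64-bit addition.
__global__ void add_u64(const uint64_t* a, const uint64_t* b, uint64_t* c)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}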

Epliz avatar Mar 23 '22 06:03 Epliz

Also, in my experience, marking the asm as volatile prevents some optimisations, so if possible you should remove it.

Epliz avatar Mar 23 '22 07:03 Epliz

@ahatstat , for me, what you indicated last, i.e.

asm volatile ("V_ADD_CO_U32 %0, %1, %2;" : "=v"(r) : "v"(a), "v"(b));

works as long as you apply it to operands that are in VGPRs. If they are in SGPRs, it doesn't. To check whether that's the case for you too, see if it works when applied to something in VGPRs (threadIdx.x, for example) versus something in SGPRs (blockIdx.x, for example).
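If it helps, a tiny kernel like this one (the name is only illustrative) makes the difference visible in the generated ISA: threadIdx.x ends up in a VGPR (v0), while blockIdx.x is handed to the kernel in an SGPR:

#include <hip/hip_runtime.h>
#include <cstdint>

__global__ void where_values_live(uint32_t* out)
{
    uint32_t tid = threadIdx.x;   // per-lane value, held in a VGPR
    uint32_t bid = blockIdx.x;    // wave-uniform value, passed in an SGPR
    out[bid * blockDim.x + tid] = tid + bid;
}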

Epliz avatar Mar 23 '22 14:03 Epliz

@Epliz I think you are right about needing to use vcc. There are some clues here.
I would expect all the operands to be VGPRs, but perhaps vcc is an SGPR, or perhaps the compiler is choosing to store the data in an SGPR. Is it possible the compiler could choose an s register where I would expect a v, making it difficult to know how to write this function?
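From the LLVM AMDGPU docs it looks like the constraint letter selects the register class, so "v" should force an operand into a VGPR and "s" into an SGPR. A tiny check (sketch only, the variable names are illustrative):

    uint32_t v_val, s_val;
    asm ("v_mov_b32 %0, 0" : "=v"(v_val));   // output constrained to a VGPR
    asm ("s_mov_b32 %0, 0" : "=s"(s_val));   // output constrained to an SGPR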

ahatstat avatar Mar 23 '22 18:03 ahatstat

Here is a working example. The kernel add_128 uses regular C++ to implement multi-precision addition of two 128-bit numbers. The kernel add_128_asm should be equivalent, using inline assembly, but I can't get it to compile. If the HIP-Clang compiler behaves like CUDA's, inline assembly is needed for optimal performance; however, this might be an incorrect assumption.

  1. How can I view the assembly produced for the add_128 kernel?
  2. How do I fix the add_128_asm kernel so that it works?

main.cpp

#include <iostream>
#include <cstdint>
#include <random>
#include <vector>
#include "hip/hip_runtime.h"

#define N 100000
#define LIMBS 4

struct uint128
{
    uint32_t limbs[LIMBS];
};

__global__ void add_128(const uint128* a, const uint128* b, uint128* c) 
{
    int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    if (index >= N)
        return;

    uint8_t carry = 0;
    for (int i=0; i<LIMBS; i++)
    {
        uint64_t tmp = (uint64_t)a[index].limbs[i] + (uint64_t)b[index].limbs[i] + carry; 
        carry = tmp > 0xFFFFFFFF ? 1 : 0;
        c[index].limbs[i] = tmp;
    }
}

__global__ void add_128_asm(const uint128* a, const uint128* b, uint128* c) 
{
    int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    if (index >= N)
        return;

    //asm volatile ("V_ADD_CO_U32 %0, %1, %2" : "=v"(c[index].limbs[0]) : "v"(a[index].limbs[0]), "v"(b[index].limbs[0]));
    for (int i=1; i<LIMBS; i++)
    {
    //   asm volatile ("V_ADD_CO_CI_U32 %0, vcc, %1, %2, vcc;" : "=v"(c[index].limbs[i]) : "v"(a[index].limbs[i]), "v"(b[index].limbs[i]));
    }
}

int main()
{
    std::random_device rand_dev;
    std::mt19937 generator(rand_dev());
    std::uniform_int_distribution<uint32_t> distr(0, 0xFFFFFFFF);
  
    std::vector<uint128> hostA(N);
    std::vector<uint128> hostB(N);
    std::vector<uint128> hostC(N);

    uint128* deviceA;
    uint128* deviceB;
    uint128* deviceC;

    // initialize the input data
    for (int i=0; i<LIMBS; i++)
    {
        hostA[0].limbs[i] = 0xFFFFFFFF;
        hostB[0].limbs[i] = 0;
    }
    hostB[0].limbs[0] = 1;

    for (int i = 1; i < N; i++) 
    {
        for (int j=0; j < LIMBS; j++)
        {
            hostA[i].limbs[j] = distr(generator);
            hostB[i].limbs[j] = distr(generator);
        }
    }
  
    hipError_t hip_error = hipMalloc((void**)&deviceA, N * sizeof(uint128));
    hip_error = hipMalloc((void**)&deviceB, N * sizeof(uint128));
    hip_error = hipMalloc((void**)&deviceC, N * sizeof(uint128));
    
    hip_error = hipMemcpy(deviceA, hostA.data(), N*sizeof(uint128), hipMemcpyHostToDevice);
    hip_error = hipMemcpy(deviceB, hostB.data(), N*sizeof(uint128), hipMemcpyHostToDevice);

    int threads = 128;
    int blocks = (N + threads - 1)/(threads);
    hipLaunchKernelGGL(add_128, blocks, threads, 0, 0, deviceA, deviceB, deviceC);

    hip_error = hipMemcpy(hostC.data(), deviceC, N*sizeof(uint128), hipMemcpyDeviceToHost);

    // verify the results
    bool passed = true;
    for (int j=0; j < LIMBS; j++)
        passed &= hostC[0].limbs[j] == 0;
    
    for (int i = 0; i < N; i++) 
    {
        uint8_t carry = 0;
        for (int j=0; j<LIMBS; j++)
        {
            uint64_t tmp = (uint64_t)hostA[i].limbs[j] + (uint64_t)hostB[i].limbs[j] + carry; 
            carry = tmp > 0xFFFFFFFF ? 1 : 0;
            passed &= hostC[i].limbs[j] == (uint32_t)tmp;
            if (!passed)
            {
                std::cout << "Failed at vector " << i << " " << hostC[i].limbs[j] << " vs " << (uint32_t)tmp << std::endl;
                break;
            }
        }
        
    }

    if (passed)
        std::cout << "PASSED" << std::endl;
    else
        std::cout << "FAILED" << std::endl;


    hip_error = hipFree(deviceA);
    hip_error = hipFree(deviceB);
    hip_error = hipFree(deviceC);

}

ahatstat avatar Mar 23 '22 21:03 ahatstat

@ahatstat , I don't have a Navi2 GPU to confirm the following works, but at least it compiles:

#include <iostream>
#include <cstdint>
#include <random>
#include <vector>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#define N 100000
#define LIMBS 4

struct uint128
{
    uint32_t limbs[LIMBS];
};

__global__ void add_128(const uint128* a, const uint128* b, uint128* c) 
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= N)
        return;

    uint8_t carry = 0;
    for (int i=0; i<LIMBS; i++)
    {
        uint64_t tmp = (uint64_t)a[index].limbs[i] + (uint64_t)b[index].limbs[i] + carry; 
        carry = tmp > 0xFFFFFFFF ? 1 : 0;
        c[index].limbs[i] = tmp;
    }
}

__global__ void add_128_asm(const uint128* a, const uint128* b, uint128* c) 
{
    int32_t index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= N)
        return;

    a = &a[index];
    b = &b[index];
    c = &c[index];
    
    uint32_t a_0 = a->limbs[0];
    uint32_t b_0 = b->limbs[0];
    uint32_t c_0;

    uint32_t a_1 = a->limbs[1];
    uint32_t b_1 = b->limbs[1];
    uint32_t c_1;
    
    uint32_t a_2 = a->limbs[2];
    uint32_t b_2 = b->limbs[2];
    uint32_t c_2;
    
    uint32_t a_3 = a->limbs[3];
    uint32_t b_3 = b->limbs[3];
    uint32_t c_3;

// using gfx macros as per https://github.com/ROCm-Developer-Tools/HIP/blob/develop/samples/2_Cookbook/14_gpu_arch/gpuarch.cpp
#if defined(__gfx900__)
   asm ("v_add_co_u32 %0, vcc, %1, %2;" : "=v"(c_0) : "v"(a_0), "v"(b_0): "vcc");
   asm ("v_addc_co_u32 %0, vcc, %1, %2, vcc;" : "=v"(c_1) : "v"(a_1), "v"(b_1): "vcc");
   asm ("v_addc_co_u32 %0, vcc, %1, %2, vcc;" : "=v"(c_2) : "v"(a_2), "v"(b_2): "vcc");
   asm ("v_addc_co_u32 %0, vcc, %1, %2, vcc;" : "=v"(c_3) : "v"(a_3), "v"(b_3): "vcc");
#elif defined(__gfx1030__)
// needs to use vcc_lo as it compiles with wavefront size 32
   asm ("v_add_co_u32 %0, vcc_lo, %1, %2;" : "=v"(c_0) : "v"(a_0), "v"(b_0): "vcc_lo");
   asm ("v_add_co_ci_u32 %0, %1, %2;" : "=v"(c_1) : "v"(a_1), "v"(b_1): "vcc");
   asm ("v_add_co_ci_u32 %0, %1, %2;" : "=v"(c_2) : "v"(a_2), "v"(b_2): "vcc");
   asm ("v_add_co_ci_u32 %0, %1, %2;" : "=v"(c_3) : "v"(a_3), "v"(b_3): "vcc");
#endif

    c->limbs[0] = c_0;
    c->limbs[1] = c_1;
    c->limbs[2] = c_2;
    c->limbs[3] = c_3;
}

int main()
{
    std::random_device rand_dev;
    std::mt19937 generator(rand_dev());
    std::uniform_int_distribution<uint32_t> distr(0, 0xFFFFFFFF);
  
    std::vector<uint128> hostA(N);
    std::vector<uint128> hostB(N);
    std::vector<uint128> hostC(N);

    uint128* deviceA;
    uint128* deviceB;
    uint128* deviceC;

    // initialize the input data
    for (int i=0; i<LIMBS; i++)
    {
        hostA[0].limbs[i] = 0xFFFFFFFF;
        hostB[0].limbs[i] = 0;
    }
    hostB[0].limbs[0] = 1;

    for (int i = 1; i < N; i++) 
    {
        for (int j=0; j < LIMBS; j++)
        {
            hostA[i].limbs[j] = distr(generator);
            hostB[i].limbs[j] = distr(generator);
        }
    }
  
    hipError_t hip_error = hipMalloc((void**)&deviceA, N * sizeof(uint128));
    hip_error = hipMalloc((void**)&deviceB, N * sizeof(uint128));
    hip_error = hipMalloc((void**)&deviceC, N * sizeof(uint128));
    
    hip_error = hipMemcpy(deviceA, hostA.data(), N*sizeof(uint128), hipMemcpyHostToDevice);
    hip_error = hipMemcpy(deviceB, hostB.data(), N*sizeof(uint128), hipMemcpyHostToDevice);

    int threads = 128;
    int blocks = (N + threads - 1)/(threads);
    add_128_asm<<<dim3(blocks), dim3(threads), 0, 0>>>(deviceA, deviceB, deviceC);

    hip_error = hipMemcpy(hostC.data(), deviceC, N*sizeof(uint128), hipMemcpyDeviceToHost);

    // verify the results
    bool passed = true;
    for (int j=0; j < LIMBS; j++)
        passed &= hostC[0].limbs[j] == 0;
    
    for (int i = 0; i < N; i++) 
    {
        uint8_t carry = 0;
        for (int j=0; j<LIMBS; j++)
        {
            uint64_t tmp = (uint64_t)hostA[i].limbs[j] + (uint64_t)hostB[i].limbs[j] + (uint64_t) carry; 
            carry = tmp > 0xFFFFFFFF ? 1 : 0;
            passed &= hostC[i].limbs[j] == (uint32_t)tmp;
            if (!passed)
            {
                std::cout << "Failed at vector " << i << " " << hostC[i].limbs[j] << " vs " << (uint32_t)tmp << std::endl;
                return -1;
            }
        }
        
    }

    if (passed)
        std::cout << "PASSED" << std::endl;
    else
        std::cout << "FAILED" << std::endl;


    hip_error = hipFree(deviceA);
    hip_error = hipFree(deviceB);
    hip_error = hipFree(deviceC);

}

For my Vega card, I had to unroll your loops, as the loads/stores in the loop seemed to clobber the carry in vcc. For the instruction syntax and the operands each instruction accepts, https://llvm.org/docs/AMDGPU/AMDGPUAsmGFX10.html is a good reference.
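Another option, if scheduling between separate asm statements keeps getting in the way, is to put the whole carry chain into a single asm statement so nothing can be inserted between the additions. A sketch using the gfx1030/wave32 syntax from above and the same temporaries a_0..b_3 as in my example (not benchmarked):

#if defined(__gfx1030__)
   asm ("v_add_co_u32    %0, vcc_lo, %4, %8\n\t"
        "v_add_co_ci_u32 %1, %5, %9\n\t"
        "v_add_co_ci_u32 %2, %6, %10\n\t"
        "v_add_co_ci_u32 %3, %7, %11"
        : "=v"(c_0), "=v"(c_1), "=v"(c_2), "=v"(c_3)
        : "v"(a_0), "v"(a_1), "v"(a_2), "v"(a_3),
          "v"(b_0), "v"(b_1), "v"(b_2), "v"(b_3)
        : "vcc_lo");
#endif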

Epliz avatar Mar 24 '22 08:03 Epliz

  1. How can I view the assembly produced for the add_128 kernel?

You can compile with -save-temps and check the dumped .s file.
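For example (the exact names of the dumped files depend on the toolchain; the ones below are only typical):

hipcc --offload-arch=gfx1030 -save-temps -c main.cpp
# the device assembly is in one of the generated *.s files,
# e.g. main-hip-amdgcn-amd-amdhsa-gfx1030.s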

yxsamliu avatar Mar 24 '22 15:03 yxsamliu

@Epliz thank you for your help. I was able to confirm that your code works on my __gfx1030__ hardware. I was also able to get it to work in a loop. Interestingly, it always works in a loop, with or without #pragma unroll, volatile, debug, or -O3. Also interestingly, the performance of the inline assembly and the standard code was about the same. I still need to inspect the assembly to see what is going on, but this is a good sign. Here is the code I ended up with; I have only tested it functionally on __gfx1030__ hardware.

__global__ void add_128_asm(const uint128* a, const uint128* b, uint128* c) 
{
    int index = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
    if (index >= N)
        return;

    #if defined(__gfx900__)
    asm volatile ("v_add_co_u32 %0, vcc, %1, %2;" : "=v"(c[index].limbs[0]) : "v"(a[index].limbs[0]), "v"(b[index].limbs[0]));
    #pragma unroll
    for (int i=1; i<LIMBS; i++)
    {
        asm volatile ("v_addc_co_u32 %0, vcc, %1, %2, vcc;" : "=v"(c[index].limbs[i]) : "v"(a[index].limbs[i]), "v"(b[index].limbs[i]));
    }
    #elif defined(__gfx1030__)
    asm volatile ("v_add_co_u32 %0, vcc_lo, %1, %2;" : "=v"(c[index].limbs[0]) : "v"(a[index].limbs[0]), "v"(b[index].limbs[0]));
    #pragma unroll
    for (int i=1; i<LIMBS; i++)
    {
       asm volatile ("v_add_co_ci_u32 %0, %1, %2;" : "=v"(c[index].limbs[i]) : "v"(a[index].limbs[i]), "v"(b[index].limbs[i]));
    }
    #endif
}

ahatstat avatar Mar 24 '22 15:03 ahatstat

@yxsamliu thanks for the tip. As expected, the assembly for the pure C++ version was not optimal: instead of using the carry flag directly, the compiler performed 64-bit additions and comparisons. My inline asm version using a loop was cleaner, but it still did a dword global load/store in the inner loop, which must limit performance. @Epliz's hand-unrolled version using temporary variables was the best, as it did a single global write at the end using the global_store_dwordx4 instruction; its execution time is about 10% faster. The assembly for the pure C++ version:

	v_add_co_u32 v6, s0, v6, v7
	v_add_co_ci_u32_e64 v7, s0, 0, 0, s0
	s_mov_b64 s[0:1], 0xffffffff
	global_store_dword v[0:1], v6, off
	global_load_dword v8, v[2:3], off offset:4
	global_load_dword v9, v[4:5], off offset:4
	v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[6:7]
	v_mov_b32_e32 v6, s1
	v_cndmask_b32_e64 v7, 0, 1, vcc_lo
	s_waitcnt vmcnt(1)
	v_add_co_u32 v7, vcc_lo, v7, v8
	v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
	s_waitcnt vmcnt(0)
	v_add_co_u32 v6, vcc_lo, v7, v9
	v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
	global_store_dword v[0:1], v6, off offset:4
	global_load_dword v8, v[2:3], off offset:8
	global_load_dword v9, v[4:5], off offset:8
	v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[6:7]
	v_mov_b32_e32 v6, s1
	v_cndmask_b32_e64 v7, 0, 1, vcc_lo
	s_waitcnt vmcnt(1)
	v_add_co_u32 v7, vcc_lo, v7, v8
	v_add_co_ci_u32_e32 v8, vcc_lo, 0, v6, vcc_lo
	s_waitcnt vmcnt(0)
	v_add_co_u32 v6, vcc_lo, v7, v9
	v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
	global_store_dword v[0:1], v6, off offset:8
	global_load_dword v2, v[2:3], off offset:12
	global_load_dword v3, v[4:5], off offset:12
	v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[6:7]
	s_waitcnt vmcnt(0)
	v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo
	global_store_dword v[0:1], v2, off offset:12

My looped version:

	global_load_dword v6, v[2:3], off
	global_load_dword v7, v[4:5], off
	v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
	s_waitcnt vmcnt(0)
	;;#ASMSTART
	v_add_co_u32 v6, vcc_lo, v6, v7;
	;;#ASMEND
	global_store_dword v[0:1], v6, off
	global_load_dword v6, v[2:3], off offset:4
	global_load_dword v7, v[4:5], off offset:4
	s_waitcnt vmcnt(0)
	;;#ASMSTART
	v_add_co_ci_u32 v6, v6, v7;
	;;#ASMEND
	global_store_dword v[0:1], v6, off offset:4
	global_load_dword v6, v[2:3], off offset:8
	global_load_dword v7, v[4:5], off offset:8
	s_waitcnt vmcnt(0)
	;;#ASMSTART
	v_add_co_ci_u32 v6, v6, v7;
	;;#ASMEND
	global_store_dword v[0:1], v6, off offset:8
	global_load_dword v2, v[2:3], off offset:12
	global_load_dword v3, v[4:5], off offset:12
	s_waitcnt vmcnt(0)
	;;#ASMSTART
	v_add_co_ci_u32 v2, v2, v3;
	;;#ASMEND
	global_store_dword v[0:1], v2, off offset:12

The hand-unrolled version:

	global_load_dwordx4 v[0:3], v[0:1], off
	global_load_dwordx4 v[4:7], v[4:5], off
	v_add_co_ci_u32_e32 v9, vcc_lo, s5, v9, vcc_lo
	s_waitcnt vmcnt(0)
	;;#ASMSTART
	v_add_co_u32 v0, vcc_lo, v0, v4;
	;;#ASMEND
	;;#ASMSTART
	v_add_co_ci_u32 v1, v1, v5;
	;;#ASMEND
	;;#ASMSTART
	v_add_co_ci_u32 v2, v2, v6;
	;;#ASMEND
	;;#ASMSTART
	v_add_co_ci_u32 v3, v3, v7;
	;;#ASMEND
	global_store_dwordx4 v[8:9], v[0:3], off

ahatstat avatar Mar 24 '22 16:03 ahatstat

@ahatstat Is this ticket still relevant? If not, please close. Thanks!

ppanchad-amd avatar Apr 03 '24 19:04 ppanchad-amd