hcc icon indicating copy to clipboard operation
hcc copied to clipboard

Inline assembly of DS_WRITE_B128 compile error Don't know how to handle indirect register inputs yet for constraint 'v'

Open terU3760 opened this issue 3 years ago • 1 comments

By referencing here, wrote the following inline assembly code:

inline __device__ void sts(uint32_t ptr, uint4 val) {
    asm volatile("DS_WRITE_B128 %0, %1;\n" : : "v"(ptr) , "v"(val));
}

But when compile it reports the following error:

XXXXXXXXXXXX: error: Don't know how to handle indirect register inputs yet for constraint 'v'
    asm volatile("DS_WRITE_B128 %0, %1;\n" : : "v"(ptr) , "v"(val));
                         ^

The hip file is something as

#pragma once

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include "hip/hip_runtime.h"
#define HIP_ENABLE_PRINTF

inline __device__ void sts_uint4(uint32_t ptr, uint4 val) {
    asm volatile("DS_WRITE_B128 %0, %1;\n" : : "v"(ptr) , "v"(val));
}

inline __device__ void lds_uint4(uint4 &dst, uint32_t ptr) {
    uint4 temp;
    asm volatile("DS_READ_B128 %0, %1;\n"
        : "=v"(temp)
        :  "r"(ptr));
    dst.x = temp.x;
    dst.y = temp.y;
    dst.z = temp.z;
    dst.w = temp.w;
}

static inline __device__ void v_pk_sts_uint4(uint32_t ptr, uint4 val) {
    printf("Store value: %d and %d and %d and %d into local shared memory address: %d.\n", val.x , val.y , val.z , val.w , ptr );
    sts_uint4( ptr , val );
}

static inline __device__ void v_pk_lds_uint4(uint4 &dst, uint32_t ptr) {
    uint4 tmp_loaded;
    lds_uint4( tmp_loaded , ptr );
    printf("Load value: %d and %d and %d and %d from local shared memory address: %d.\n", tmp_loaded.x, tmp_loaded.y , tmp_loaded.z , tmp_loaded.w , ptr );
    dst = tmp_loaded;
}

The cpp file is something as

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include "test_cpp_inline_asm_sts_lds_uint32_t.h"

#include "hip/hip_hcc.h"

#include "test_simple_kernel.hip"

int main(int argc,char**vargs)
{

    uint4* to_be_stored = new uint4[1];
    uint4* to_be_loaded = new uint4[1];
    uint32_t* local_share_memory_address = new uint32_t[1];

    uint4* to_be_stored_d;
    uint4* to_be_loaded_d;
    uint32_t* local_share_memory_address_d;

    to_be_stored[ 0 ].x = 1;
    to_be_stored[ 0 ].y = 2;
    to_be_stored[ 0 ].z = 19;
    to_be_stored[ 0 ].w = 20;
    local_share_memory_address[ 0 ] = 0;
    hipSetDevice(0);
    hipMalloc(&to_be_stored_d, 16);
    hipMalloc(&to_be_loaded_d, 16);
    hipMalloc(&local_share_memory_address_d, 4);
    hipMemcpy(to_be_stored_d, to_be_stored, 16, hipMemcpyHostToDevice);
    hipMemcpy(local_share_memory_address_d, local_share_memory_address, 4, hipMemcpyHostToDevice);
    printf("Stored value: %d and %d and %d and %d at local share memory address: %d.\n" , to_be_stored[ 0 ].x , to_be_stored[ 0 ].y , to_be_stored[ 0 ].z , to_be_stored[ 0 ].w , local_share_memory_address[ 0 ] );
    hipLaunchKernelGGL( halfVec_v_pk_sts_then_lds_uint4 , dim3( 1 ) , dim3( 1 ) , sizeof(uint4)*5 , 0, (uint4*)(to_be_loaded_d), (uint32_t*)(local_share_memory_address_d) , (uint4*)(to_be_stored_d) , 1 );
    hipMemcpy(to_be_loaded, to_be_loaded_d, 16, hipMemcpyDeviceToHost);
    hipDeviceSynchronize();
    printf("And then load value: %d and %d and %d and %d from local share memory address: %d.\n", to_be_loaded[ 0 ].x, to_be_loaded[ 0 ].y , to_be_loaded[ 0 ].z , to_be_loaded[ 0 ].w , local_share_memory_address[ 0 ] );

}

Thanks in advance!

terU3760 avatar Apr 01 '22 22:04 terU3760

The type declaration of the register needs to use this “typedef float Float4 attribute((ext_vector_type(4)));”

fileaccent avatar Aug 06 '23 05:08 fileaccent