HIP API calls fail with "hipErrorNoBinaryForGpu" in hybrid host/device binaries
It seems there is an issue when device and host code are in the same binary, i.e., when trying to use the runtime API. To reproduce:
// main.cu
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
#include <iostream>
#include <stdexcept>
#include <cstdio>
#ifdef RT_API
__global__ void foo()
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid == 0)
printf("kernel foo launched successfully\n");
}
#endif
namespace
{
void succeed(hipError_t e)
{
if(e != hipSuccess)
throw std::runtime_error(hipGetErrorName(e));
}
}
int main(int argc, char** argv)
{
try
{
try
{
succeed(hipInit(0));
}
catch(std::runtime_error const& e)
{
std::cout << "hipInit error: " << e.what() << std::endl;
}
catch(...)
{
std::rethrow_exception(std::current_exception());
}
#ifdef RT_API
try
{
hipLaunchKernelGGL(foo, dim3(1), dim3(32), 0, 0);
succeed(hipDeviceSynchronize());
}
catch (std::runtime_error const& e)
{
std::cout << "Error: " << e.what() << std::endl;
}
catch (...)
{
std::rethrow_exception(std::current_exception());
}
#else
try
{
hipModule_t hip_module;
succeed(hipModuleLoad(&hip_module, "kernels.hipfb"));
hipFunction_t foo;
succeed(hipModuleGetFunction(&foo, hip_module, "foo"));
succeed(hipModuleLaunchKernel(foo, 1, 1, 1, 32, 1, 1, 0, 0, nullptr, nullptr));
succeed(hipDeviceSynchronize());
succeed(hipModuleUnload(hip_module));
}
catch(std::runtime_error const& e)
{
std::cout << "Error: " << e.what() << std::endl;
}
catch(...)
{
std::rethrow_exception(std::current_exception());
}
#endif
}
catch(std::exception const& e)
{
std::cout << "Unexpected exception: " << e.what() << std::endl;
}
return 0;
}
Compile & run:
hipcc -DRT_API hipGraph.cpp -o hipGraphTest
./main
Output:
"hipErrorNoBinaryForGpu: Unable to find code object for all current devices!"
Aborted (core dumped)
Everything works fine when moving the kernel into a separate file, compiling into a module and using the driver API to launch it:
// kernels.cu
#include <hip/hip_runtime.h>
#include <cstdio>
extern "C" __global__ void foo()
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid == 0)
printf("kernel foo launched successfully\n");
}
Compile & run:
hipcc hipGraph.cpp -o hipGraphTest
hipcc kernels.cu --genco -o kernels.hipfb
./main
Output:
kernel foo launched successfully
My sytem information: hipconfig output:
HIP version : 5.2.21151-afdc89f8
== hipconfig
HIP_PATH : /opt/rocm-5.2.0
ROCM_PATH : /opt/rocm-5.2.0
HIP_COMPILER : clang
HIP_PLATFORM : amd
HIP_RUNTIME : rocclr
CPP_CONFIG : -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.2.0/include -I/opt/rocm-5.2.0/llvm/bin/../lib/clang/14.0.0 -I/opt/rocm-5.2.0/hsa/include
== hip-clang
HSA_PATH : /opt/rocm-5.2.0/hsa
HIP_CLANG_PATH : /opt/rocm-5.2.0/llvm/bin
AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.2.0 22204 50d6d5d5b608d2abd6af44314abc6ad20036af3b)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-5.2.0/llvm/bin
AMD LLVM version 14.0.0git
Optimized build.
Default target: x86_64-unknown-linux-gnu
Host CPU: znver1
Registered Targets:
amdgcn - AMD GCN GPUs
r600 - AMD GPUs HD2XXX-HD6XXX
x86 - 32-bit X86: Pentium-Pro and above
x86-64 - 64-bit X86: EM64T and AMD64
hip-clang-cxxflags : -std=c++11 -isystem "/opt/rocm-5.2.0/llvm/lib/clang/14.0.0/include/.." -isystem /opt/rocm-5.2.0/hsa/include -isystem "/opt/rocm-5.2.0/include" -O3
hip-clang-ldflags : -L"/opt/rocm-5.2.0/lib" -O3 -lgcc_s -lgcc -lpthread -lm -lrt
=== Environment Variables
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
== Linux Kernel
Hostname : daniel-G5-5505
Linux daniel-G5-5505 5.11.0-40-generic #44~20.04.2-Ubuntu SMP Tue Oct 26 18:07:44 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 20.04.4 LTS
Release: 20.04
Codename: focal
rocminfo output:
ROCk module is loaded
=====================
HSA System Attributes
=====================
Runtime Version: 1.1
System Timestamp Freq.: 1000.000000MHz
Sig. Max Wait Duration: 18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model: LARGE
System Endianness: LITTLE
==========
HSA Agents
==========
*******
Agent 1
*******
Name: AMD Ryzen 7 4800H with Radeon Graphics
Uuid: CPU-XX
Marketing Name: AMD Ryzen 7 4800H with Radeon Graphics
Vendor Name: CPU
Feature: None specified
Profile: FULL_PROFILE
Float Round Mode: NEAR
Max Queue Number: 0(0x0)
Queue Min Size: 0(0x0)
Queue Max Size: 0(0x0)
Queue Type: MULTI
Node: 0
Device Type: CPU
Cache Info:
L1: 32768(0x8000) KB
Chip ID: 0(0x0)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 2900
BDFID: 0
Internal Node ID: 0
Compute Unit: 16
SIMDs per CU: 0
Shader Engines: 0
Shader Arrs. per Eng.: 0
WatchPts on Addr. Ranges:1
Features: None
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: FINE GRAINED
Size: 15734000(0xf014f0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 2
Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED
Size: 15734000(0xf014f0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
Pool 3
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 15734000(0xf014f0) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Alignment: 4KB
Accessible by all: TRUE
ISA Info:
*******
Agent 2
*******
Name: gfx1010
Uuid: GPU-XX
Marketing Name: AMD Radeon RX 5600M
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 1
Device Type: GPU
Cache Info:
L1: 16(0x10) KB
L2: 4096(0x1000) KB
Chip ID: 29471(0x731f)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 1750
BDFID: 768
Internal Node ID: 1
Compute Unit: 36
SIMDs per CU: 2
Shader Engines: 4
Shader Arrs. per Eng.: 2
WatchPts on Addr. Ranges:4
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 32(0x20)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 40(0x28)
Max Work-item Per CU: 1280(0x500)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
Max fbarriers/Workgrp: 32
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 6275072(0x5fc000) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx1010:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
FBarrier Max Size: 32
*******
Agent 3
*******
Name: gfx90c
Uuid: GPU-XX
Marketing Name:
Vendor Name: AMD
Feature: KERNEL_DISPATCH
Profile: BASE_PROFILE
Float Round Mode: NEAR
Max Queue Number: 128(0x80)
Queue Min Size: 64(0x40)
Queue Max Size: 131072(0x20000)
Queue Type: MULTI
Node: 2
Device Type: GPU
Cache Info:
L1: 16(0x10) KB
L2: 1024(0x400) KB
Chip ID: 5686(0x1636)
Cacheline Size: 64(0x40)
Max Clock Freq. (MHz): 1600
BDFID: 1792
Internal Node ID: 2
Compute Unit: 27
SIMDs per CU: 4
Shader Engines: 2
Shader Arrs. per Eng.: 2
WatchPts on Addr. Ranges:4
Features: KERNEL_DISPATCH
Fast F16 Operation: TRUE
Wavefront Size: 64(0x40)
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Max Waves Per CU: 40(0x28)
Max Work-item Per CU: 2560(0xa00)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
Max fbarriers/Workgrp: 32
Pool Info:
Pool 1
Segment: GLOBAL; FLAGS: COARSE GRAINED
Size: 524288(0x80000) KB
Allocatable: TRUE
Alloc Granule: 4KB
Alloc Alignment: 4KB
Accessible by all: FALSE
Pool 2
Segment: GROUP
Size: 64(0x40) KB
Allocatable: FALSE
Alloc Granule: 0KB
Alloc Alignment: 0KB
Accessible by all: FALSE
ISA Info:
ISA 1
Name: amdgcn-amd-amdhsa--gfx90c:xnack-
Machine Models: HSA_MACHINE_MODEL_LARGE
Profiles: HSA_PROFILE_BASE
Default Rounding Mode: NEAR
Default Rounding Mode: NEAR
Fast f16: TRUE
Workgroup Max Size: 1024(0x400)
Workgroup Max Size per Dimension:
x 1024(0x400)
y 1024(0x400)
z 1024(0x400)
Grid Max Size: 4294967295(0xffffffff)
Grid Max Size per Dimension:
x 4294967295(0xffffffff)
y 4294967295(0xffffffff)
z 4294967295(0xffffffff)
FBarrier Max Size: 32
*** Done ***
Does it happen if you compile with an explicit --offload-arch=gfxNNN --offload-arch=gfxMMM ... ?
I recommend always being explicit about what GPUs you want to target when not using the runtime compiler.
Thanks for your reply. Same old, unfortunately. I explicitly compiled for all archs that are in the (working) module/offload-bundle but still get the same error.
@mlakardaniel Apologies for the lack of response. Can you please test with latest ROCm 6.0.2 (HIP 6.0.32831)? If resolved, please close ticket. Thanks!