HIP HIP API calls fail with "hipErrorNoBinaryForGpu" in hybrid host/device binaries

It seems there is an issue when device and host code are in the same binary, i.e., when trying to use the runtime API. To reproduce:

// main.cu
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>

#include <iostream>
#include <stdexcept>
#include <cstdio>

#ifdef RT_API
__global__ void foo()
{
	unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if(tid == 0)
		printf("kernel foo launched successfully\n");
}
#endif

namespace
{
	void succeed(hipError_t e)
	{
		if(e != hipSuccess)
			throw std::runtime_error(hipGetErrorName(e));
	}
}
int main(int argc, char** argv)
{
	try
	{
		try
		{
			succeed(hipInit(0));
		}
		catch(std::runtime_error const& e)
		{
			std::cout << "hipInit error: " << e.what() << std::endl;
		}
		catch(...)
		{
			std::rethrow_exception(std::current_exception());
		}

#ifdef RT_API
		try
		{
			hipLaunchKernelGGL(foo, dim3(1), dim3(32), 0, 0);
			succeed(hipDeviceSynchronize());
		}
		catch (std::runtime_error const& e)
		{
			std::cout << "Error: " << e.what() << std::endl;
		}
		catch (...)
		{
			std::rethrow_exception(std::current_exception());
		}
#else
		try
		{
			hipModule_t hip_module;
			succeed(hipModuleLoad(&hip_module, "kernels.hipfb"));

			hipFunction_t foo;
			succeed(hipModuleGetFunction(&foo, hip_module, "foo"));
			
			succeed(hipModuleLaunchKernel(foo, 1, 1, 1, 32, 1, 1, 0, 0, nullptr, nullptr));
			succeed(hipDeviceSynchronize());
		
			succeed(hipModuleUnload(hip_module));
		}
		catch(std::runtime_error const& e)
		{
			std::cout << "Error: " << e.what() << std::endl;
		}
		catch(...)
		{
			std::rethrow_exception(std::current_exception());
		}
#endif		
	}
	catch(std::exception const& e)
	{
		std::cout << "Unexpected exception: " << e.what() << std::endl;
	}	
	return 0;
}

Compile & run:

hipcc -DRT_API hipGraph.cpp -o hipGraphTest
./main

Output:

"hipErrorNoBinaryForGpu: Unable to find code object for all current devices!"
Aborted (core dumped)

Everything works fine when moving the kernel into a separate file, compiling into a module and using the driver API to launch it:

// kernels.cu
#include <hip/hip_runtime.h>
#include <cstdio>

extern "C" __global__ void foo()
{
	unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
	if(tid == 0)
		printf("kernel foo launched successfully\n");
}

Compile & run:

hipcc hipGraph.cpp -o hipGraphTest
hipcc kernels.cu --genco -o kernels.hipfb
./main

Output:

kernel foo launched successfully

My sytem information: hipconfig output:

HIP version  : 5.2.21151-afdc89f8

== hipconfig
HIP_PATH     : /opt/rocm-5.2.0
ROCM_PATH    : /opt/rocm-5.2.0
HIP_COMPILER : clang
HIP_PLATFORM : amd
HIP_RUNTIME  : rocclr
CPP_CONFIG   :  -D__HIP_PLATFORM_HCC__= -D__HIP_PLATFORM_AMD__= -I/opt/rocm-5.2.0/include -I/opt/rocm-5.2.0/llvm/bin/../lib/clang/14.0.0 -I/opt/rocm-5.2.0/hsa/include

== hip-clang
HSA_PATH         : /opt/rocm-5.2.0/hsa
HIP_CLANG_PATH   : /opt/rocm-5.2.0/llvm/bin
AMD clang version 14.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.2.0 22204 50d6d5d5b608d2abd6af44314abc6ad20036af3b)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-5.2.0/llvm/bin
AMD LLVM version 14.0.0git
  Optimized build.
  Default target: x86_64-unknown-linux-gnu
  Host CPU: znver1

  Registered Targets:
    amdgcn - AMD GCN GPUs
    r600   - AMD GPUs HD2XXX-HD6XXX
    x86    - 32-bit X86: Pentium-Pro and above
    x86-64 - 64-bit X86: EM64T and AMD64
hip-clang-cxxflags :  -std=c++11 -isystem "/opt/rocm-5.2.0/llvm/lib/clang/14.0.0/include/.." -isystem /opt/rocm-5.2.0/hsa/include -isystem "/opt/rocm-5.2.0/include" -O3
hip-clang-ldflags  :  -L"/opt/rocm-5.2.0/lib" -O3 -lgcc_s -lgcc -lpthread -lm -lrt

=== Environment Variables
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin

== Linux Kernel
Hostname     : daniel-G5-5505
Linux daniel-G5-5505 5.11.0-40-generic #44~20.04.2-Ubuntu SMP Tue Oct 26 18:07:44 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 20.04.4 LTS
Release:	20.04
Codename:	focal

rocminfo output:

ROCk module is loaded
=====================    
HSA System Attributes    
=====================    
Runtime Version:         1.1
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             

==========               
HSA Agents               
==========               
*******                  
Agent 1                  
*******                  
  Name:                    AMD Ryzen 7 4800H with Radeon Graphics
  Uuid:                    CPU-XX                             
  Marketing Name:          AMD Ryzen 7 4800H with Radeon Graphics
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        0(0x0)                             
  Queue Min Size:          0(0x0)                             
  Queue Max Size:          0(0x0)                             
  Queue Type:              MULTI                              
  Node:                    0                                  
  Device Type:             CPU                                
  Cache Info:              
    L1:                      32768(0x8000) KB                   
  Chip ID:                 0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   2900                               
  BDFID:                   0                                  
  Internal Node ID:        0                                  
  Compute Unit:            16                                 
  SIMDs per CU:            0                                  
  Shader Engines:          0                                  
  Shader Arrs. per Eng.:   0                                  
  WatchPts on Addr. Ranges:1                                  
  Features:                None
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: FINE GRAINED        
      Size:                    15734000(0xf014f0) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED
      Size:                    15734000(0xf014f0) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 3                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    15734000(0xf014f0) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
  ISA Info:                
*******                  
Agent 2                  
*******                  
  Name:                    gfx1010                            
  Uuid:                    GPU-XX                             
  Marketing Name:          AMD Radeon RX 5600M                
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          64(0x40)                           
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    1                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      16(0x10) KB                        
    L2:                      4096(0x1000) KB                    
  Chip ID:                 29471(0x731f)                      
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   1750                               
  BDFID:                   768                                
  Internal Node ID:        1                                  
  Compute Unit:            36                                 
  SIMDs per CU:            2                                  
  Shader Engines:          4                                  
  Shader Arrs. per Eng.:   2                                  
  WatchPts on Addr. Ranges:4                                  
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      TRUE                               
  Wavefront Size:          32(0x20)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        40(0x28)                           
  Max Work-item Per CU:    1280(0x500)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    6275072(0x5fc000) KB               
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx1010:xnack-  
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*******                  
Agent 3                  
*******                  
  Name:                    gfx90c                             
  Uuid:                    GPU-XX                             
  Marketing Name:                                             
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          64(0x40)                           
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    2                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      16(0x10) KB                        
    L2:                      1024(0x400) KB                     
  Chip ID:                 5686(0x1636)                       
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   1600                               
  BDFID:                   1792                               
  Internal Node ID:        2                                  
  Compute Unit:            27                                 
  SIMDs per CU:            4                                  
  Shader Engines:          2                                  
  Shader Arrs. per Eng.:   2                                  
  WatchPts on Addr. Ranges:4                                  
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      TRUE                               
  Wavefront Size:          64(0x40)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        40(0x28)                           
  Max Work-item Per CU:    2560(0xa00)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    524288(0x80000) KB                 
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx90c:xnack-   
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*** Done ***

Jul 14 '22 08:07 mlakardaniel

Does it happen if you compile with an explicit --offload-arch=gfxNNN --offload-arch=gfxMMM ... ?

I recommend always being explicit about what GPUs you want to target when not using the runtime compiler.

Jul 14 '22 14:07 b-sumner

Thanks for your reply. Same old, unfortunately. I explicitly compiled for all archs that are in the (working) module/offload-bundle but still get the same error.

Jul 14 '22 15:07 mlakardaniel

@mlakardaniel Apologies for the lack of response. Can you please test with latest ROCm 6.0.2 (HIP 6.0.32831)? If resolved, please close ticket. Thanks!

Apr 09 '24 15:04 ppanchad-amd