rocprofiler
rocprofiler copied to clipboard
rocm profiler creates trace for 1 gpu only when kernels launched onto two separate kernels.
#include <stdio.h>
#include "hip/hip_runtime.h"
// 1. if N is set to up to 1024, then sum is OK.
// 2. Set N past the 1024 which is past No. of threads per blocks, and then all iterations of sum results in
// even the ones within the block.
// 3. To circumvent the problem described in 2. above, since if N goes past No. of threads per block, we need multiple block launch.
// The trick is describe in p65 to use formula (N+127) / 128 for blocknumbers so that when block number starts from 1, it is
// (1+127) / 128.
#define N 2048
#define N 536870912
#define MAX_THREAD_PER_BLOCK 1024
__global__ void add( int * a, int * b, int * c ) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x ;
if (tid < N)
c[tid] = a[tid] + b[tid];
}
int main (void) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c;
int stepSize;
int count = 0;
hipGetDeviceCount(&count);
printf("\nDevice count: %d.", count);
if (count < 2) {
printf("No. of devices must be at least 2.");
return 1;
}
// allocate dev memory for N size for pointers declared earlier.
// allocate dev memory for N size for pointers declared earlier.
printf("\nAllocating memory...(size %u array size of INT).\n", N );
hipMalloc( (void**)&dev_a, N * sizeof(int));
hipMalloc( (void**)&dev_b, N * sizeof(int));
hipMalloc( (void**)&dev_c, N * sizeof(int));
const unsigned blocks = 512;
const unsigned threadsPerBlock = 256;
// invoke the kernel:
// block count: (N+127)/128
// thread count: 128
hipSetDevice(0);
hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c);
hipSetDevice(1);
hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c);
hipDeviceSynchronize();
hipFree(dev_a);
hipFree(dev_b);
hipFree(dev_c);
}
use following to compile and create trace:
FILE1=p61
for FILE in $FILE1 ; do
hipcc $FILE.cpp -o $FILE.out
rocprof --hip-trace -d ./$FILE ./$FILE.out
done
there is a result.json created and when opened in chrome tracer, only gpu0 is seen.
00:07.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Vega 10 [Radeon Instinct MI25 MxGPU] (rev 06)
00:08.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Vega 10 [Radeon Instinct MI25 MxGPU] (rev 06)
root@sriov-guest:~/dev-learn/gpu/hip/hip-stream-aql/p61-2gpus# egrep -irn gpu results.json
4:,{"args":{"name":"GPU0"},"ph":"M","pid":6,"name":"process_name","sort_index":2}
268: "Agent2.Uuid": "GPU-0215141e35aa2184",
269: "Agent2.MarketingName": "Vega10[RadeonInstinctMI25MxGPU]",
279: "Agent2.DeviceType": "GPU",
335: "Agent3.Uuid": "GPU-0215141e35aa2904",
336: "Agent3.MarketingName": "Vega10[RadeonInstinctMI25MxGPU]",
346: "Agent3.DeviceType": "GPU",
I suspect that the first kernel doesn't finish when the program exits.
hipSetDevice(0);
hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c); // launch on gpu 0
hipSetDevice(1);
hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c); // launch on gpu 1
hipDeviceSynchronize(); // wait for gpu 1
hipSetDevice(0); // new code
hipDeviceSynchronize(); // new code: wait for gpu 0
I haven't checked it, it's just a thought.
(By the way, recent HIP versions support CUDA's syntax kernel<<<...>>>(...))
then should not it show that exactly?? It shlould not be the reason for ignoring 2nd gpu?
Hi @gggh000, are you still encountering this issue when profiling with two GPUs? I was unable to reproduce this issue using your example program. Please see the output of egrep -irn gpu results.json below:
4:,{"args":{"name":"GPU0"},"ph":"M","pid":6,"name":"process_name","sort_index":2}
5:,{"args":{"name":"GPU Barriers0"},"ph":"M","pid":518,"name":"process_name","sort_index":3}
6:,{"args":{"name":"GPU1"},"ph":"M","pid":7,"name":"process_name","sort_index":4}
7:,{"args":{"name":"GPU Barriers1"},"ph":"M","pid":519,"name":"process_name","sort_index":5}
278: "Agent2.Uuid": "GPU-XXX",
289: "Agent2.DeviceType": "GPU",
361: "Agent3.Uuid": "GPU-XXX",
372: "Agent3.DeviceType": "GPU",
If you do still see this issue, could you please confirm if you are using a virtual machine and if you can reproduce the issue on baremetal as well.
Please also note that MI25 is no longer supported for ROCm. You can find the updated list of supported GPUs at the System requirements page.