HIP
[Question] Penalty for using Array of Structs (AoS) and Struct of Array (SoA) paradigm?
Hello,
I wanted to understand how exactly ROCm handles memory coalescing, so I set out to measure the trade-offs between an AoS and an SoA layout, and the results are surprising. As a control, I included a straight array implementation, and I am finding the straight array layout to be the most efficient, even though the straight array and SoA should have very similar access patterns. When compiling for an NVIDIA device, I find that the times are nearly identical, but on an AMD MI50 the straight array case is about 2x faster than both the SoA and AoS implementations. Is memory coalescing different on AMD architectures?
Here is a minimal reproducible example of adding 1.0 to an array of column vectors:
#include <iostream>
#include <chrono>
#include <vector>
#include <cassert>
#include <cstdio>   // fprintf
#include <cstdlib>  // calloc, free, exit
#include "hip/hip_runtime.h"

using namespace std::chrono;

constexpr int nvars = 4;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(hipError_t code, const char *file, int line, bool abort=true)
{
    if (code != hipSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Array-of-Structs element: one 32-byte record per vector.
struct AoS {
    double t, x, y, z;

    __device__ AoS& operator+=(const AoS &vec) {
        t += vec.t;
        x += vec.x;
        y += vec.y;
        z += vec.z;
        return *this;
    }
};
// Struct-of-Arrays container holding both the host and device copies of each component.
struct SoA {
    size_t size;   // size of each component array in bytes
    bool host_allocated, device_allocated;
    double* t;
    double* x;
    double* y;
    double* z;
    double* gpu_t;
    double* gpu_x;
    double* gpu_y;
    double* gpu_z;
    SoA* device_ptr;

    SoA(int elements) : size(elements * sizeof(double)), host_allocated(false), device_allocated(false) {}

    ~SoA(){
        if (host_allocated){
            free(t);
            free(x);
            free(y);
            free(z);
        }
        if (device_allocated) {
            hipFree(gpu_t);
            hipFree(gpu_x);
            hipFree(gpu_y);
            hipFree(gpu_z);
            hipFree(device_ptr);
        }
    }

    void host_allocate() {
        // calloc (rather than malloc) so the host arrays start at 0.0 and the
        // validation in main() is meaningful.
        t = (double*)calloc(size, 1);
        x = (double*)calloc(size, 1);
        y = (double*)calloc(size, 1);
        z = (double*)calloc(size, 1);
        host_allocated = true;
    }

    void device_allocate() {
        if (!host_allocated) {
            host_allocate();
        }
        gpuErrchk(hipMalloc((void**)&device_ptr, sizeof(SoA)));
        gpuErrchk(hipMalloc((void**)&gpu_t, size));
        gpuErrchk(hipMalloc((void**)&gpu_x, size));
        gpuErrchk(hipMalloc((void**)&gpu_y, size));
        gpuErrchk(hipMalloc((void**)&gpu_z, size));
        device_allocated = true;
    }

    void copy_to_device() {
        if (!device_allocated) {
            device_allocate();
        }
        gpuErrchk(hipMemcpy(gpu_t, t, size, hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(gpu_x, x, size, hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(gpu_y, y, size, hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(gpu_z, z, size, hipMemcpyHostToDevice));
        // Patch the device-side struct so its pointer members refer to the device arrays.
        gpuErrchk(hipMemcpy(&(device_ptr->t), &gpu_t, sizeof(double *), hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(&(device_ptr->x), &gpu_x, sizeof(double *), hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(&(device_ptr->y), &gpu_y, sizeof(double *), hipMemcpyHostToDevice));
        gpuErrchk(hipMemcpy(&(device_ptr->z), &gpu_z, sizeof(double *), hipMemcpyHostToDevice));
    }

    void copy_to_host() {
        if (!device_allocated) {
            device_allocate();
        }
        gpuErrchk(hipMemcpy(t, gpu_t, size, hipMemcpyDeviceToHost));
        gpuErrchk(hipMemcpy(x, gpu_x, size, hipMemcpyDeviceToHost));
        gpuErrchk(hipMemcpy(y, gpu_y, size, hipMemcpyDeviceToHost));
        gpuErrchk(hipMemcpy(z, gpu_z, size, hipMemcpyDeviceToHost));
    }

    SoA* get_ptr(){
        if (device_allocated) {
            return device_ptr;
        }
        return this;
    }
};
// Straight array: nvars contiguous blocks of n doubles, indexed as var * n + ii.
__global__ void arr_add(double* vec, int n){
    int ii = blockIdx.x * blockDim.x + threadIdx.x;
    if (ii >= n)
        return;

    #pragma unroll
    for (int var = 0; var < nvars; var++) {
        vec[var * n + ii] += 1.0;
    }
}

__global__ void aos_add(AoS* vec, int n){
    int ii = blockIdx.x * blockDim.x + threadIdx.x;
    if (ii >= n)
        return;

    vec[ii] += AoS{1.0, 1.0, 1.0, 1.0};
}

__global__ void soa_add(SoA *vec, int n){
    int ii = blockIdx.x * blockDim.x + threadIdx.x;
    if (ii >= n)
        return;

    vec->t[ii] += 1.0;
    vec->x[ii] += 1.0;
    vec->y[ii] += 1.0;
    vec->z[ii] += 1.0;
}
int main() {
    constexpr int n = 1 << 25;
    constexpr int block_size = 128;
    high_resolution_clock::time_point t1, t2;
    duration<double> dt1, dt2, dt3;

    SoA hybrid_soa(n);
    hybrid_soa.device_allocate();
    hybrid_soa.copy_to_device();

    std::vector<AoS> host_vec_aos(n);
    std::vector<double> host_arr(n * nvars);
    AoS *dev_aos;
    double *dev_arr;
    gpuErrchk(hipMalloc((void**)&dev_aos, n * sizeof(AoS)));
    gpuErrchk(hipMalloc((void**)&dev_arr, n * nvars * sizeof(double)));
    gpuErrchk(hipMemcpy(dev_aos, host_vec_aos.data(), n * sizeof(AoS), hipMemcpyHostToDevice));
    gpuErrchk(hipMemcpy(dev_arr, host_arr.data(), n * nvars * sizeof(double), hipMemcpyHostToDevice));

    const int nblocks = (n + block_size - 1) / block_size;
    std::cout << "Size of AoS struct is " << sizeof(AoS) << " bytes" << "\n";
    std::cout << "Size of SoA struct is " << sizeof(SoA) << " bytes" << "\n";

    t1 = high_resolution_clock::now();
    arr_add<<<nblocks, block_size>>>(dev_arr, n);
    gpuErrchk(hipDeviceSynchronize());
    t2 = high_resolution_clock::now();
    dt1 = t2 - t1;
    std::cout << "SrA took: " << std::scientific << dt1.count() << " seconds" << "\n";

    t1 = high_resolution_clock::now();
    aos_add<<<nblocks, block_size>>>(dev_aos, n);
    gpuErrchk(hipDeviceSynchronize());
    t2 = high_resolution_clock::now();
    dt2 = t2 - t1;
    std::cout << "AoS took: " << std::scientific << dt2.count() << " seconds" << "\n";

    t1 = high_resolution_clock::now();
    soa_add<<<nblocks, block_size>>>(hybrid_soa.get_ptr(), n);
    gpuErrchk(hipDeviceSynchronize());
    t2 = high_resolution_clock::now();
    dt3 = t2 - t1;
    std::cout << "SoA took: " << std::scientific << dt3.count() << " seconds" << "\n";

    gpuErrchk(hipMemcpy(host_vec_aos.data(), dev_aos, n * sizeof(AoS), hipMemcpyDeviceToHost));
    gpuErrchk(hipMemcpy(host_arr.data(), dev_arr, n * nvars * sizeof(double), hipMemcpyDeviceToHost));
    hybrid_soa.copy_to_host();

    std::cout << "Straight array is: " << dt2.count() / dt1.count() << " times faster than AoS" << "\n";
    std::cout << "Straight array is: " << dt3.count() / dt1.count() << " times faster than SoA" << "\n";

    // Validate that all three layouts produced the same result.
    for (int i = 0; i < n; i++)
    {
        if ((host_vec_aos[i].t != host_arr[i + 0 * n]) || (host_vec_aos[i].t != hybrid_soa.t[i])) {
            std::cout << "something went wrong" << "\n";
            break;
        }
        assert(host_vec_aos[i].t == 1.0);
        assert(host_arr[i + 0 * n] == 1.0);
        assert(hybrid_soa.t[i] == 1.0);
    }

    hipFree(dev_aos);
    hipFree(dev_arr);
    return 0;
}
Running the layout examples (https://github.com/zjin-lcf/HeCBench/tree/master/layout-cuda and https://github.com/zjin-lcf/HeCBench/tree/master/layout-hip) on V100 and MI100 GPUs shows:
CUDA (V100)
Average kernel execution time (AoS): 721.129 (us)
PASS
Average kernel execution time (SoA): 171.567 (us)
PASS
HIP (MI100)
Average kernel execution time (AoS): 1936.43 (us)
PASS
Average kernel execution time (SoA): 1848.72 (us)
PASS
Hello,
Thank you for your response. I went back and tested my snippet above on an NVIDIA RTX 2070 and an AMD MI50 and found:
RTX 2070 (NVIDIA)
Size of AoS struct is 32 bytes
Size of SoA struct is 88 bytes
SrA took: 7.390207e-03 seconds
AoS took: 6.942387e-03 seconds
SoA took: 6.966799e-03 seconds
Straight array is: 9.394036e-01 times faster than AoS
Straight array is: 9.427069e-01 times faster than SoA
MI50 (AMD)
Size of AoS struct is 32 bytes
Size of SoA struct is 88 bytes
SrA took: 3.714583e-03 seconds
AoS took: 2.837258e-03 seconds
SoA took: 2.807818e-03 seconds
Straight array is: 7.638160e-01 times faster than AoS
Straight array is: 7.558905e-01 times faster than SoA
So it now appears that SoA and AoS are performing faster than the straight arrays? When I posted this question, the MI50s were reporting that the straight array case was at least 2x faster than both AoS and SoA. I'm not sure what is happening. I will test the code you linked and report back.
The tests linked above cause Segmentation faults for me.
Thank you for reporting the issue. I will update the codes slightly.
Please type: ./main 1000
I also ran your HIP program on an MI100:
./a.out
Size of AoS struct is 32 bytes
Size of SoA struct is 88 bytes
SrA took: 3.666829e-03 seconds
AoS took: 2.167618e-03 seconds
SoA took: 2.182816e-03 seconds
Straight array is: 5.911424e-01 times faster than AoS
Straight array is: 5.952871e-01 times faster than SoA
./a.out
Size of AoS struct is 32 bytes
Size of SoA struct is 88 bytes
SrA took: 3.774229e-03 seconds
AoS took: 2.164001e-03 seconds
SoA took: 2.152870e-03 seconds
Straight array is: 5.733624e-01 times faster than AoS
Straight array is: 5.704132e-01 times faster than SoA
So somehow in my snippet, using straight arrays is proving slower than the SoA and AoS constructs? Interesting... Your code on an RTX 3060 gives:
Average kernel execution time (AoS): 688.792 (us)
PASS
Average kernel execution time (SoA): 244.339 (us)
PASS
and on an MI50 (Edit: I think there might be something wrong with the AMD cluster at the moment. Standing by...)
Average kernel execution time (AoS): 65.5724 (us)
FAIL
Average kernel execution time (SoA): 59.2822 (us)
FAIL
@EigenDev The fact that you're seeing variance in your tests leads me to guess that you're seeing run-to-run variability due to e.g. clock speed ramp-up times or other factors. I would recommend running your test kernels a few times in a loop and gathering their average performance. Many AMD GPUs will take a few milliseconds to ramp clock frequencies if you're starting from an idle GPU.
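For example, here is a rough sketch of what I mean for the SrA case, reusing your kernel, timing variables, and error-check macro (n_trials is just an illustrative name):
// Warm-up launch so clock ramp-up isn't charged to the measurement.
arr_add<<<nblocks, block_size>>>(dev_arr, n);
gpuErrchk(hipDeviceSynchronize());
// Then average over several launches.
constexpr int n_trials = 10;
t1 = high_resolution_clock::now();
for (int trial = 0; trial < n_trials; trial++) {
    arr_add<<<nblocks, block_size>>>(dev_arr, n);
}
gpuErrchk(hipDeviceSynchronize());
t2 = high_resolution_clock::now();
dt1 = (t2 - t1) / n_trials;
std::cout << "SrA took: " << std::scientific << dt1.count() << " seconds (avg of " << n_trials << " runs)" << "\n";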
On a Radeon VII I just tested, putting a for (int i = 0; i < 10; i++) loop around each of the kernel dispatches yields the following:
Size of AoS struct is 32 bytes
Size of SoA struct is 88 bytes
SrA took: 2.633087e-02 seconds
AoS took: 2.598574e-02 seconds
SoA took: 2.628838e-02 seconds
Straight array is: 9.868923e-01 times faster than AoS
Straight array is: 9.983863e-01 times faster than SoA
So roughly each kernel takes about 2.6ms on my test system, which means at least the SrA kernel is likely to be noisy if you only run it once (e.g., it spends >50% of its runtime at some lower clock frequency).
Running these 100 times, I see that the AoS version of your kernel may be slightly faster (maybe 1%). If I had to guess, it's because in SoA and SrA you end up waiting for the data to return each time around your unrolled loop (you can use the roc-obj tool to do an assembly dump of all three generated kernels -- you'll see that there are 4 sets of "load/waitcnt/store" in the SrA and SoA kernels, while the AoS kernel does a pair of wider loads, then a single waitcnt, then a wider set of stores).
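To illustrate the point (a sketch I haven't benchmarked, and arr_add_batched is just an illustrative name), a variant of your SrA kernel that issues all four loads before any stores gives the compiler a chance to group the memory waits the way the AoS version does:
__global__ void arr_add_batched(double* vec, int n){
    int ii = blockIdx.x * blockDim.x + threadIdx.x;
    if (ii >= n)
        return;

    double tmp[nvars];
    #pragma unroll
    for (int var = 0; var < nvars; var++) {   // issue all loads up front
        tmp[var] = vec[var * n + ii];
    }
    #pragma unroll
    for (int var = 0; var < nvars; var++) {   // then all adds and stores
        vec[var * n + ii] = tmp[var] + 1.0;
    }
}
Whether the compiler actually schedules it that way is something you'd want to confirm in the roc-obj disassembly.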
@jlgreathouse Thank you for this insight! Ultimately, I wanted to know for certain if the AoS approach was harming me in any way in one of my codes, and it doesn't appear that way. By the way, is there a reference in the HIP ROCm docs about row-major versus column-major memory layouts? I remember hearing about a year ago that CUDA's threads are row-major while ROCm's are column-major but I can't find any documentation on that.
As for @zjin-lcf , I ran ./main 1000 on an MI50 and got:
Average kernel execution time (AoS): 7818.68 (us)
PASS
Average kernel execution time (SoA): 1607.53 (us)
PASS
but I'm not sure why the difference is so large in your test case.
@jlgreathouse Can you please take a look at the variance in performance across AMD GPUs? Thanks.
./main 1000
MI50-60
Average kernel execution time (AoS): 7818.68 (us)
PASS
Average kernel execution time (SoA): 1607.53 (us)
PASS
MI100
Average kernel execution time (AoS): 1920.56 (us)
PASS
Average kernel execution time (SoA): 1831.33 (us)
PASS
MI200
Average kernel execution time (AoS): 721.253 (us)
PASS
Average kernel execution time (SoA): 979.484 (us)
PASS
By the way, is there a reference in the HIP ROCm docs about row-major versus column-major memory layouts? I remember hearing about a year ago that CUDA's threads are row-major while ROCm's are column-major but I can't find any documentation on that.
@EigenDev I'm not sure if I'm following your question, as you mention both memory layouts and threads being row- or column-major. If you're talking about how threads are scheduled to the device: on existing AMD accelerators that are supported by ROCm, we primarily dispatch workgroups (and threads within those workgroups) in linearized order. We first sweep across the X dimension, then the Y dimension, and finally the Z dimension. So a dim3(2,3,4) workgroup dispatch (X=2, Y=3, Z=4) would first launch workgroup (X=0, Y=0, Z=0), then workgroup (X=1, Y=0, Z=0), then workgroup (X=0, Y=1, Z=0), etc.
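In other words, the flattened workgroup ID that matches this launch order is roughly:
// X varies fastest, then Y, then Z.
int wg_id = blockIdx.x
          + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);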
@jlgreathouse Thank you for the confirmation. I was curious because I notice a considerable drop in performance when going from a 1D AoS to a 2D AoS scheme in a hydro code that I have, and I was wondering if coalescing was the issue. My data is row-major by default, and I didn't know if computing stencils in the y-direction was the key source of the performance degradation. I am still learning how to read the rocprof output, so it will take some time for me to find the bottleneck. Thank you again for the information and for taking the time to respond.
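To make the access pattern concrete, here is a simplified sketch of what I mean (illustrative names, not my actual hydro code): with row-major data, mapping threadIdx.x to the column keeps neighboring threads on adjacent addresses, while the y-direction stencil reads are strided by a full row.
// Row-major 2D field of AoS cells: cell (row, col) lives at q[row * width + col].
__global__ void stencil_y(const AoS* qin, AoS* qout, int width, int height){
    int col = blockIdx.x * blockDim.x + threadIdx.x;   // contiguous dimension -> coalesced
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= width || row <= 0 || row >= height - 1)
        return;

    // Reads along x sit next to each other in memory, but the y-neighbors
    // are width * sizeof(AoS) bytes apart.
    qout[row * width + col].t = 0.5 * (qin[(row + 1) * width + col].t
                                     + qin[(row - 1) * width + col].t);
}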
@EigenDev Hi, is your issue resolved on the latest HIP? If so can we close this ticket?