VkFFT
Fail to compile
Hi, it seems that increasing the kernel size to anything above 8192 floats (with 2 batches) causes a parsing error (VKFFT_ERROR_FAILED_SHADER_PARSE) for batched convolutions. I've reproduced this in many settings, but perhaps the simplest is by changing the kernel size in sample #52 (code below). The exact error message is also below. I checked the publicized limits but didn't find any that applies (hardware: AMD Radeon Pro 5300M 4 GB, macOS). Any help would be much appreciated, thank you.
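For reference, the failing setup boils down to these two configuration lines (the full modified sample follows below):

configuration.size[0] = 8192 * 2; //any kernel size above 8192 floats fails to compile
configuration.numberBatches = 2; //batched convolution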
- error message:
#version 450

layout (local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
const float loc_PI = 3.1415926535897932384626433832795f;
const float loc_SQRT1_2 = 0.70710678118654752440084436210485f;
layout(std430, binding = 0) buffer DataIn{
vec2 inputs[16386];
};

layout(std430, binding = 1) buffer DataOut{
vec2 outputs[16386];
};

layout(std430, binding = 2) buffer Kernel_FFT{
vec2 kernel_obj[16386];
};

void main() {
uint id_x = gl_GlobalInvocationID.x % 4096;
uint id_y = (gl_GlobalInvocationID.x / 4096) % 1;
uint id_z = (gl_GlobalInvocationID.x / 4096) / 1;
if (gl_GlobalInvocationID.x < 4096){
uint inoutID = (id_x + id_y*8193 + id_z*8193) + (null) * 8193;
uint inoutID2;
uint inoutID3;
vec2 t0 = inputs[inoutID];
vec2 tf;
if (id_x == 0) {
inoutID2 = (8192 + id_y*8193 + id_z*8193) + (null) * 8193;
inoutID3 = (4096 + id_y*8193 + id_z*8193) + (null) * 8193;
tf = inputs[inoutID3];
} else {
inoutID2 = ((8192-id_x) + id_y*8193 + id_z*8193) + (null) * 8193;
}
vec2 t1 = inputs[inoutID2];
vec2 t2;
vec2 t3;
if (id_x == 0) {
t2.x = (t0.x+t1.x);
t2.y = (t0.x-t1.x);
tf.y = -tf.y;
tf.x = tf.x * 2;
tf.y = tf.y * 2;
outputs[inoutID] = t2;
outputs[inoutID3] = tf;
} else {
t2.x = t0.x + t1.x;
t2.y = t0.y + t1.y;
t3.x = t0.x - t1.x;
t3.y = t0.y - t1.y;
float angle = (loc_PI*id_x)/8192;
tf.x = cos(angle);
tf.y = sin(angle);
t0.x = tf.x*t2.y + tf.y*t3.x;
t0.y = -tf.y*t2.y + tf.x*t3.x;
t1.x = t2.x + t0.x;
t1.y = -t3.y + t0.y;
t0.x = t2.x - t0.x;
t0.y = t3.y + t0.y;
outputs[inoutID] = t0;
outputs[inoutID2] = t1;
}
}
}
ERROR: 0:23: 'null' : undeclared identifier
ERROR: 0:23: '=' : cannot convert from ' temp float' to ' temp highp uint'
ERROR: 0:23: '' : compilation terminated
ERROR: 3 compilation errors. No code generated.
VkFFT shader type: 0

Note the literal (null) in the generated shader above, where a numeric offset should have been emitted - that is the 'undeclared identifier' the compiler rejects.
- error reproduction by changing sample #52:
VkFFTResult resFFT = VKFFT_SUCCESS;
VkResult res = VK_SUCCESS;
//Configuration + FFT application.
VkFFTConfiguration configuration = {};
VkFFTConfiguration convolution_configuration = {};
VkFFTApplication app_convolution = {};
VkFFTApplication app_kernel = {};
//Convolution sample code
//Setting up FFT configuration. FFT is performed in-place with no performance loss.
std::cout << "size:" << 2*real_kernel.size() << std::endl;
configuration.FFTdim = 1; //FFT dimension, 1D, 2D or 3D (default 1).
configuration.size[0] = 8192*2; //Multidimensional FFT dimensions sizes (default 1). For best performance (and stability), order dimensions in descendant size order as: x>y>z.
configuration.size[1] = 1;
configuration.size[2] = 1;
configuration.kernelConvolution = true; //specify if this plan is used to create kernel for convolution
configuration.performR2C = true; //Perform R2C/C2R transform. Can be combined with all other options. Reduces memory requirements by a factor of 2. Requires special input data alignment: for x*y*z system pad x*y plane to (x+2)*y with last 2*y elements reserved, total array dimensions are (x*y+2y)*z. Memory layout after R2C and before C2R can be found on github.
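// Worked numbers for this repro (x = 16384, y = z = 1): each real row is
// padded from 16384 to 16384 + 2 = 16386 floats, i.e. 8193 complex values -
// which matches the vec2 [16386] buffers and the 8193 stride in the failing
// shader above.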
configuration.coordinateFeatures = 1; //Specify dimensionality of the input feature vector (default 1). Each component is stored not as a vector, but as a separate system and padded on its own according to other options (i.e. for x*y system of 3-vector, first x*y elements correspond to the first dimension, then goes x*y for the second, etc).
//coordinateFeatures number is an important constant for convolution. If we perform 1x1 convolution, it is equal to number of features, but matrixConvolution should be equal to 1. For matrix convolution, it must be equal to matrixConvolution parameter. If we perform 2x2 convolution, it is equal to 3 for symmetric kernel (stored as xx, xy, yy) and 4 for nonsymmetric (stored as xx, xy, yx, yy). Similarly, 6 (stored as xx, xy, xz, yy, yz, zz) and 9 (stored as xx, xy, xz, yx, yy, yz, zx, zy, zz) for 3x3 convolutions.
configuration.normalize = 1;//normalize iFFT
configuration.numberBatches = 2;
//After this, configuration file contains pointers to Vulkan objects needed to work with the GPU: VkDevice* device - created device, [uint64_t *bufferSize, VkBuffer *buffer, VkDeviceMemory* bufferDeviceMemory] - allocated GPU memory FFT is performed on. [uint64_t *kernelSize, VkBuffer *kernel, VkDeviceMemory* kernelDeviceMemory] - allocated GPU memory, where kernel for convolution is stored.
configuration.device = &vkGPU->device;
#if(VKFFT_BACKEND==0)
configuration.queue = &vkGPU->queue; //to allocate memory for LUT, we have to pass a queue, vkGPU->fence, commandPool and physicalDevice pointers
configuration.fence = &vkGPU->fence;
configuration.commandPool = &vkGPU->commandPool;
configuration.physicalDevice = &vkGPU->physicalDevice;
configuration.isCompilerInitialized = 1; //compiler can be initialized before VkFFT plan creation. if not, VkFFT will create and destroy one after initialization
#elif(VKFFT_BACKEND==3)
configuration.platform = &vkGPU->platform;
configuration.context = &vkGPU->context;
#endif
//In this modified example, we perform a batched 1x1 convolution of a real input with a nonsymmetric kernel. We use configuration to initialize the convolution kernel first from real data, then we create convolution_configuration for the convolution. The buffer object from configuration is passed to convolution_configuration as the kernel object.
//1. Kernel forward FFT.
uint64_t kernelSize = ((uint64_t)configuration.numberBatches) * configuration.coordinateFeatures * sizeof(float) * 2 * (configuration.size[0] / 2 + 1) * configuration.size[1] * configuration.size[2];
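// For this configuration the product is 2 (batches) * 1 (feature) * 4
// (sizeof(float)) * 2 * (16384 / 2 + 1) * 1 * 1 = 131,088 bytes (~0.13 MB),
// which is why the printf below reports 0 MB after integer division.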
#if(VKFFT_BACKEND==0)
VkBuffer kernel = {};
VkDeviceMemory kernelDeviceMemory = {};
resFFT = allocateBuffer(vkGPU, &kernel, &kernelDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, kernelSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
configuration.buffer = &kernel;
#elif(VKFFT_BACKEND==1)
cuFloatComplex* kernel = 0;
res = cudaMalloc((void**)&kernel, kernelSize);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = (void**)&kernel;
#elif(VKFFT_BACKEND==2)
hipFloatComplex* kernel = 0;
res = hipMalloc((void**)&kernel, kernelSize);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = (void**)&kernel;
#elif(VKFFT_BACKEND==3)
cl_mem kernel = 0;
kernel = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, kernelSize, 0, &res);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
configuration.buffer = &kernel;
#endif
configuration.bufferSize = &kernelSize;
printf("Total memory needed for kernel: %" PRIu64 " MB\n", kernelSize / 1024 / 1024);
//Fill kernel on CPU.
float* kernel_input = (float*)malloc(kernelSize);
if (!kernel_input) return VKFFT_ERROR_MALLOC_FAILED;
for (uint64_t f = 0; f < configuration.numberBatches; f++) {
	for (uint64_t v = 0; v < configuration.coordinateFeatures; v++) {
		for (uint64_t k = 0; k < configuration.size[2]; k++) {
			for (uint64_t j = 0; j < configuration.size[1]; j++) {
				//Below is the test identity kernel for 1x1 nonsymmetric FFT, multiplied by (f * configuration.coordinateFeatures + v + 1).
				for (uint64_t i = 0; i < configuration.size[0] / 2 + 1; i++) {
					uint64_t index = 2 * i + j * (configuration.size[0] + 2) + k * (configuration.size[0] + 2) * configuration.size[1] + v * (configuration.size[0] + 2) * configuration.size[1] * configuration.size[2] + f * configuration.coordinateFeatures * (configuration.size[0] + 2) * configuration.size[1] * configuration.size[2];
					kernel_input[index] = (float)(f * configuration.coordinateFeatures + v + 1.0);
					kernel_input[index + 1] = 0;
				}
			}
		}
	}
}
//Sample buffer transfer tool. Uses staging buffer of the same size as destination buffer, which can be reduced if transfer is done sequentially in small buffers.
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, kernel_input, &kernel, kernelSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
#elif(VKFFT_BACKEND==1)
res = cudaMemcpy(kernel, kernel_input, kernelSize, cudaMemcpyHostToDevice);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==2)
res = hipMemcpy(kernel, kernel_input, kernelSize, hipMemcpyHostToDevice);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==3)
res = clEnqueueWriteBuffer(vkGPU->commandQueue, kernel, CL_TRUE, 0, kernelSize, kernel_input, 0, NULL, NULL);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
#endif
//Initialize application responsible for the kernel. This function loads shaders, creates pipeline and configures FFT based on configuration file. No buffer allocations inside VkFFT library.
resFFT = initializeVkFFT(&app_kernel, configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
//Sample forward FFT command buffer allocation + execution performed on kernel. Second number determines how many times to perform the application in one submit. FFT can also be appended to user-defined command buffers.
//Uncomment the line below if you want to perform the kernel FFT. In this sample we use a predefined identity kernel.
//performVulkanFFT(vkGPU, &app_kernel, -1, 1);
//The kernel has been transformed.
//2. Buffer convolution with transformed kernel.
//Copy configuration, as it mostly remains unchanged. Change specific parts.
convolution_configuration = configuration;
convolution_configuration.kernelConvolution = false;
convolution_configuration.performConvolution = true;
convolution_configuration.symmetricKernel = false;//Specify if convolution kernel is symmetric. In this case we only pass upper triangle part of it in the form of: (xx, xy, yy) for 2d and (xx, xy, xz, yy, yz, zz) for 3d.
#if(VKFFT_BACKEND==0)
convolution_configuration.kernel = &kernel;
#elif(VKFFT_BACKEND==1)
convolution_configuration.kernel = (void**)&kernel;
#elif(VKFFT_BACKEND==2)
convolution_configuration.kernel = (void**)&kernel;
#elif(VKFFT_BACKEND==3)
convolution_configuration.kernel = &kernel;
#endif
convolution_configuration.kernelSize = &kernelSize;
convolution_configuration.numberBatches = 1;//one batch - numberKernels convolutions
convolution_configuration.numberKernels = configuration.numberBatches;// number of convolutions on a single input
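// In other words: the single input batch is convolved with each of the
// numberKernels (= 2) kernels stored in the kernel buffer, so the output
// buffer below holds numberKernels copies of the input layout.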
//Allocate separate buffer for the input data.
uint64_t inputBufferSize = ((uint64_t)convolution_configuration.coordinateFeatures) * sizeof(float) * 2 * (convolution_configuration.size[0] / 2 + 1) * convolution_configuration.size[1] * convolution_configuration.size[2];
uint64_t bufferSize = convolution_configuration.numberKernels * convolution_configuration.coordinateFeatures * sizeof(float) * 2 * (convolution_configuration.size[0] / 2 + 1) * convolution_configuration.size[1] * convolution_configuration.size[2];
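// Worked numbers: inputBufferSize = 1 * 4 * 2 * (16384 / 2 + 1) = 65,544
// bytes; bufferSize = 2 (numberKernels) * 65,544 = 131,088 bytes.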
convolution_configuration.isInputFormatted = true; //if input is a different buffer, it doesn't have to be zeropadded/R2C padded
#if(VKFFT_BACKEND==0)
VkBuffer inputBuffer = {};
VkBuffer buffer = {};
VkDeviceMemory inputBufferDeviceMemory = {};
VkDeviceMemory bufferDeviceMemory = {};
resFFT = allocateBuffer(vkGPU, &inputBuffer, &inputBufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, inputBufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
resFFT = allocateBuffer(vkGPU, &buffer, &bufferDeviceMemory, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, VK_MEMORY_HEAP_DEVICE_LOCAL_BIT, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
convolution_configuration.inputBuffer = &inputBuffer;
convolution_configuration.buffer = &buffer;
#elif(VKFFT_BACKEND==1)
cuFloatComplex* inputBuffer = 0;
cuFloatComplex* buffer = 0;
res = cudaMalloc((void**)&inputBuffer, inputBufferSize);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
res = cudaMalloc((void**)&buffer, bufferSize);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
convolution_configuration.inputBuffer = (void**)&inputBuffer;
convolution_configuration.buffer = (void**)&buffer;
#elif(VKFFT_BACKEND==2)
hipFloatComplex* inputBuffer = 0;
hipFloatComplex* buffer = 0;
res = hipMalloc((void**)&inputBuffer, inputBufferSize);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
res = hipMalloc((void**)&buffer, bufferSize);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
convolution_configuration.inputBuffer = (void**)&inputBuffer;
convolution_configuration.buffer = (void**)&buffer;
#elif(VKFFT_BACKEND==3)
cl_mem inputBuffer = 0;
cl_mem buffer = 0;
inputBuffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, inputBufferSize, 0, &res);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
buffer = clCreateBuffer(vkGPU->context, CL_MEM_READ_WRITE, bufferSize, 0, &res);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_ALLOCATE;
convolution_configuration.inputBuffer = &inputBuffer;
convolution_configuration.buffer = &buffer;
#endif
convolution_configuration.inputBufferSize = &inputBufferSize;
convolution_configuration.bufferSize = &bufferSize;
printf("Total memory needed for buffer: %" PRIu64 " MB\n", bufferSize / 1024 / 1024);
//Fill data on CPU. It is best to perform all operations on GPU after initial upload.
float* buffer_input = (float*)malloc(inputBufferSize);
if (!buffer_input) return VKFFT_ERROR_MALLOC_FAILED;
for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) {
	for (uint64_t k = 0; k < convolution_configuration.size[2]; k++) {
		for (uint64_t j = 0; j < convolution_configuration.size[1]; j++) {
			for (uint64_t i = 0; i < convolution_configuration.size[0]; i++) {
				buffer_input[i + j * (convolution_configuration.size[0] + 2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]] = 1;
			}
		}
	}
}
//Transfer data to GPU using staging buffer.
#if(VKFFT_BACKEND==0)
resFFT = transferDataFromCPU(vkGPU, buffer_input, &inputBuffer, inputBufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
#elif(VKFFT_BACKEND==1)
res = cudaMemcpy(inputBuffer, buffer_input, inputBufferSize, cudaMemcpyHostToDevice);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==2)
res = hipMemcpy(inputBuffer, buffer_input, inputBufferSize, hipMemcpyHostToDevice);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==3)
res = clEnqueueWriteBuffer(vkGPU->commandQueue, inputBuffer, CL_TRUE, 0, inputBufferSize, buffer_input, 0, NULL, NULL);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
#endif
//Initialize application responsible for the convolution.
resFFT = initializeVkFFT(&app_convolution, convolution_configuration);
if (resFFT != VKFFT_SUCCESS) return resFFT;
//Sample convolution command buffer allocation + execution. FFT can also be appended to user-defined command buffers.
VkFFTLaunchParams launchParams = {};
resFFT = performVulkanFFT(vkGPU, &app_convolution, &launchParams, -1, 1);
if (resFFT != VKFFT_SUCCESS) return resFFT;
//The convolution has been performed.
float* buffer_output = (float*)malloc(bufferSize);
if (!buffer_output) return VKFFT_ERROR_MALLOC_FAILED;
//Transfer data from GPU using staging buffer.
#if(VKFFT_BACKEND==0)
resFFT = transferDataToCPU(vkGPU, buffer_output, &buffer, bufferSize);
if (resFFT != VKFFT_SUCCESS) return resFFT;
#elif(VKFFT_BACKEND==1)
res = cudaMemcpy(buffer_output, buffer, bufferSize, cudaMemcpyDeviceToHost);
if (res != cudaSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==2)
res = hipMemcpy(buffer_output, buffer, bufferSize, hipMemcpyDeviceToHost);
if (res != hipSuccess) return VKFFT_ERROR_FAILED_TO_COPY;
#elif(VKFFT_BACKEND==3)
res = clEnqueueReadBuffer(vkGPU->commandQueue, buffer, CL_TRUE, 0, bufferSize, buffer_output, 0, NULL, NULL);
if (res != CL_SUCCESS) return VKFFT_ERROR_FAILED_TO_COPY;
#endif
//Print data, if needed.
for (uint64_t f = 0; f < convolution_configuration.numberKernels; f++) {
	printf("\nKernel id: %" PRIu64 "\n\n", f);
	for (uint64_t v = 0; v < convolution_configuration.coordinateFeatures; v++) {
		printf("\ncoordinate: %" PRIu64 "\n\n", v);
		for (uint64_t k = 0; k < convolution_configuration.size[2]; k++) {
			for (uint64_t j = 0; j < convolution_configuration.size[1]; j++) {
				for (uint64_t i = 0; i < convolution_configuration.size[0]; i++) {
					printf("%.6f ", buffer_output[i + j * (convolution_configuration.size[0] + 2) + k * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] + v * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2] + f * convolution_configuration.coordinateFeatures * (convolution_configuration.size[0] + 2) * convolution_configuration.size[1] * convolution_configuration.size[2]]);
				}
				std::cout << "\n";
			}
		}
	}
}
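// Expected output for this identity kernel, once the plan initializes: every
// sample is 1.000000 for kernel 0 and 2.000000 for kernel 1 (the kernel value
// f * coordinateFeatures + v + 1 times the all-ones input). At this size,
// however, initializeVkFFT fails with VKFFT_ERROR_FAILED_SHADER_PARSE before
// anything is printed.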
free(kernel_input);
free(buffer_input);
free(buffer_output);
#if(VKFFT_BACKEND==0)
vkDestroyBuffer(vkGPU->device, inputBuffer, NULL);
vkFreeMemory(vkGPU->device, inputBufferDeviceMemory, NULL);
vkDestroyBuffer(vkGPU->device, buffer, NULL);
vkFreeMemory(vkGPU->device, bufferDeviceMemory, NULL);
vkDestroyBuffer(vkGPU->device, kernel, NULL);
vkFreeMemory(vkGPU->device, kernelDeviceMemory, NULL);
#elif(VKFFT_BACKEND==1)
cudaFree(inputBuffer);
cudaFree(buffer);
cudaFree(kernel);
#elif(VKFFT_BACKEND==2)
hipFree(inputBuffer);
hipFree(buffer);
hipFree(kernel);
#elif(VKFFT_BACKEND==3)
clReleaseMemObject(inputBuffer);
clReleaseMemObject(buffer);
clReleaseMemObject(kernel);
#endif
deleteVkFFT(&app_kernel);
deleteVkFFT(&app_convolution);
return resFFT;
Hello,
Currently, I have not implemented convolution codegen for multiple-upload sequences - and that is exactly what happens above 8192 on AMD GPUs: the sequence no longer fits in a single upload. In general, the algorithm is no different from what is already there; I just haven't had a particular use for it before. I will hopefully have it covered once I reorganize the codebase in the near future.
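In the meantime, a guard on the application side can turn the shader parse failure into an explicit, early error. A minimal sketch (the 8192 threshold is the empirical single-upload limit from this report on that GPU, not a value queried from the device, and convolutionPlanLikelySupported is a hypothetical helper, not part of the VkFFT API):

#include <cstdint>

// Empirical single-upload limit for convolution plans, taken from this report
// (AMD Radeon Pro 5300M); device-dependent, not an official VkFFT constant.
static const uint64_t kMaxConvSequenceLength = 8192;

// Returns true if a 1D batched-convolution plan of this length is expected to
// compile with the current codegen (single-upload sequences only).
inline bool convolutionPlanLikelySupported(uint64_t sizeX) {
	return sizeX <= kMaxConvSequenceLength;
}

Checking this before initializeVkFFT lets the application fall back, for example to three separate plans (forward FFT, elementwise multiply, inverse FFT), instead of failing at shader compilation.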
Best regards, Dmitrii
Great, thank you.