Overallocating workgroup buffers inside a compute shader causes memory corruption
Describe the bug
If you create a compute shader whose workgroup buffers need more memory than the allowed limit (maxComputeWorkgroupStorageSize, 16384 bytes by default), then writing to one buffer causes corruption in the other buffers.
To Reproduce
Code attached at bottom.
Observed behaviour
If I write to one shared buffer and then write to another shared buffer, the second shared buffer's contents overlap with the previous buffer's when read back.
Screenshots
NA
Your environment
██ system:
platform: Windows-10-10.0.26100-SP0
python_implementation: CPython
python: 3.10.16
██ versions:
wgpu: 0.22.2
cffi: 1.17.1
numpy: 2.2.6
██ wgpu_native_info:
expected_version: 24.0.3.1
lib_version: 24.0.3.1
lib_path: .\resources\wgpu_native-release.dll
██ object_counts:
                      count  resource_mem

            Adapter:      1
          BindGroup:      0
    BindGroupLayout:      0
             Buffer:      0             0
      CanvasContext:      0
      CommandBuffer:      0
     CommandEncoder:      0
 ComputePassEncoder:      0
    ComputePipeline:      0
             Device:      1
     PipelineLayout:      0
           QuerySet:      0
              Queue:      1
       RenderBundle:      0
RenderBundleEncoder:      0
  RenderPassEncoder:      0
     RenderPipeline:      0
            Sampler:      0
       ShaderModule:      0
            Texture:      0
        TextureView:      0

              total:      3             0
██ wgpu_native_counts:
                   count  mem   hub  a  k  r  el_size

        Adapter:       1    8  hub:  1  1  0        8
      BindGroup:       0    0  hub:  0  0  1       16
BindGroupLayout:       0    0  hub:  0  0  1       16
         Buffer:       0    0  hub:  0  0  7       16
  CanvasContext:       0    0        0  0  0        8
  CommandBuffer:       0    0  hub:  0  0  1        8
ComputePipeline:       0    0  hub:  0  0  1       16
         Device:       1    8  hub:  1  1  0        8
  PipelineCache:       0    0  hub:  0  0  0       16
 PipelineLayout:       0    0  hub:  0  0  1       16
       QuerySet:       0    0  hub:  0  0  0       16
          Queue:       1    8  hub:  1  1  0        8
   RenderBundle:       0    0  hub:  0  0  0       16
 RenderPipeline:       0    0  hub:  0  0  0       16
        Sampler:       0    0  hub:  0  0  0       16
   ShaderModule:       2   32  hub:  2  2  1       16
        Texture:       0    0  hub:  0  0  0       16
    TextureView:       0    0  hub:  0  0  0       16

          total:       5   56
* The a, k, r are allocated, kept, and released, respectively.
* Reported memory does not include buffer/texture data.
Steps to reproduce
const RANDOM_FLOAT_ARRAY_LENGTH:u32 = 256;
const OBJECTIVE_FUNCTION_OUTPUT_LENGTH:u32 = 63;
const OBJECTIVE_FUNCTION_INPUT_LENGTH:u32 = 195;
const SVD_EPSILON:f32 = 1e-10;
const WORKGROUP_SIZE_X:u32 = 8;
const WORKGROUP_SIZE_Y:u32 = 8;
struct State {
    random_vector_norm: f32,
}
const WORKGROUP_LENGTH = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y;
const OBJECTIVE_FUNCTION_JACOBIAN_SIZE = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_INPUT_LENGTH;
@group(0) @binding(1)
var<storage, read> jacobian: array<f32>;
@group(0) @binding(2)
var<storage, read_write> U: array<f32>;
var<workgroup> workgroup_shared_obj_out_len_buf: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> workgroup_shared_obj_inp_len_buf: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> jacobian_workgroup_copy: array<
    f32,
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> decomposed_jacobian: array<
    f32,
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> one_dim_svd_b: array<
    f32,
    OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> current_u_buffer: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> current_v_buffer: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> state: State;
// Index into the jacobian stored row-major: element (r, c).
fn row_major_access_jac(r: u32, c: u32) -> u32 {
    return r * OBJECTIVE_FUNCTION_INPUT_LENGTH + c;
}
// Index into the jacobian stored column-major: element (r, c).
fn col_major_access_jac(r: u32, c: u32) -> u32 {
    return c * OBJECTIVE_FUNCTION_OUTPUT_LENGTH + r;
}
// Number of passes the workgroup needs to cover `length` elements,
// e.g. sweep_calculator(3969u, 64u) == 63u.
fn sweep_calculator(length: u32, workgroup_size: u32) -> u32 {
    return (length + workgroup_size - 1u) / workgroup_size; // ceiling division
}
fn copy_jacobian_into_local_buffers(
    local_invocation_index: u32,
) {
    let total_elements = OBJECTIVE_FUNCTION_JACOBIAN_SIZE;
    let workgroup_size = WORKGROUP_LENGTH; // e.g., 64
    // Grid-stride loop: invocation i copies elements i, i + 64, i + 128, ...
    var index = local_invocation_index;
    while (index < total_elements) {
        jacobian_workgroup_copy[index] = jacobian[index];
        decomposed_jacobian[index] = jacobian[index];
        index += workgroup_size;
    }
    workgroupBarrier();
}
fn fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
    local_invocation_id: vec3<u32>,
    global_invocation_id: vec3<u32>,
    local_invocation_index: u32,
) {
    // Fill both buffers with a placeholder value in workgroup-size strides.
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for (var iter = 0u; iter < sweeps; iter++) {
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if (index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH) {
            workgroup_shared_obj_out_len_buf[index] = 9.9;
            current_u_buffer[index] = 9.9;
        }
    }
    workgroupBarrier();
}
fn fill_one_dim_svd_b_buffer(
    local_invocation_id: vec3<u32>,
    local_invocation_index: u32,
) {
    // Compute B = J * transpose(J): B[row][col] = sum_z J[row][z] * J[col][z],
    // reading J from the workgroup copy made in copy_jacobian_into_local_buffers.
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for (var iter = 0u; iter < sweeps; iter++) {
        let index = iter * WORKGROUP_LENGTH + local_invocation_index;
        if (index >= OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH) { continue; }
        let row = index / OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
        let col = index % OBJECTIVE_FUNCTION_OUTPUT_LENGTH;
        var b_val = 0.0;
        for (var z_index: u32 = 0u; z_index < OBJECTIVE_FUNCTION_INPUT_LENGTH; z_index++) {
            let left_matrix_index = row_major_access_jac(row, z_index);
            let right_matrix_index = row_major_access_jac(col, z_index);
            b_val += decomposed_jacobian[left_matrix_index] * decomposed_jacobian[right_matrix_index];
        }
        one_dim_svd_b[index] = b_val;
    }
    workgroupBarrier();
}
fn populate_u_with_next_singular_vector(
    local_invocation_id: vec3<u32>,
    global_invocation_id: vec3<u32>,
    local_invocation_index: u32,
) {
    fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(local_invocation_id, global_invocation_id, local_invocation_index);
    workgroupBarrier();
    fill_one_dim_svd_b_buffer(local_invocation_id, local_invocation_index);
    workgroupBarrier();
    // Copy one_dim_svd_b out to the storage buffer U for inspection.
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for (var iter = 0u; iter < sweeps; iter++) {
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if (index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH) {
            U[index] = one_dim_svd_b[index];
        }
    }
    workgroupBarrier();
}
@compute @workgroup_size(WORKGROUP_SIZE_X, WORKGROUP_SIZE_Y)
fn main(
    @builtin(local_invocation_id) local_invocation_id: vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id: vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
) {
    copy_jacobian_into_local_buffers(local_invocation_index);
    populate_u_with_next_singular_vector(local_invocation_id, global_invocation_id, local_invocation_index);
}
In the above code, if you comment out the write to workgroup_shared_obj_out_len_buf, or remove the redundant workgroup shared buffers, the code works and one_dim_svd_b and U hold the intended values. Otherwise one_dim_svd_b gets corrupted and U ends up with incorrect values. Ideally, the compiler should raise an error at compile time if the shader is configured to use more shared memory than is available.
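For reference, a quick tally of the var<workgroup> storage the shader declares (my arithmetic, not part of the original report; WebGPU's default maxComputeWorkgroupStorageSize limit is 16384 bytes):

# Tally of the workgroup storage declared in the shader above (f32 = 4 bytes).
OUT, INP = 63, 195  # output and input lengths from the shader constants
floats = (
    OUT          # workgroup_shared_obj_out_len_buf
    + INP        # workgroup_shared_obj_inp_len_buf
    + OUT * INP  # jacobian_workgroup_copy
    + OUT * INP  # decomposed_jacobian
    + OUT * OUT  # one_dim_svd_b
    + OUT        # current_u_buffer
    + INP        # current_v_buffer
    + 1          # state (a single f32)
)
print(floats * 4)  # 116224 bytes, roughly 7x the 16384-byte default limit

So the shader requests about 113 KiB of workgroup storage, which on the default limits should have been rejected at pipeline creation.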
Is this with DX12 or Vulkan? I know that DX12 has some issues with compute shaders in the current and next version of wgpu.
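(For reference, one way to force and verify the backend from Python; a sketch, untested, assuming wgpu-native's WGPU_BACKEND_TYPE environment variable and the request_adapter_sync API in recent wgpu-py:)

# Hypothetical backend check: force a backend before requesting the adapter,
# then print a summary of what was actually selected.
import os
os.environ["WGPU_BACKEND_TYPE"] = "Vulkan"  # or "D3D12"

import wgpu

adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
print(adapter.summary)  # one-line adapter summary, includes the backend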
Perhaps you can give the wgpu25 branch a try, as that includes upstream bugfixes. If you can also reproduce this problem in wgpu-native, then it's not a problem with the bindings, but perhaps with validation along the way.
Thanks for reporting this. Do you perhaps also have the Python code to run the shader with, to make it a truly reproducible example?
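For illustration, a minimal harness along these lines might serve as a starting point. This is a sketch, not the reporter's actual code; it assumes wgpu-py's wgpu.utils.compute_with_buffers helper, a hypothetical svd_shader.wgsl file holding the WGSL above, and that the output size can be given in bytes (check the helper's docs for your version):

# Hypothetical repro harness (untested): dispatch the shader for one workgroup
# and read U back; binding indices match the shader's @binding attributes.
import numpy as np
from wgpu.utils.compute import compute_with_buffers

OUT, INP = 63, 195
shader_source = open("svd_shader.wgsl").read()  # the WGSL from this report
jacobian = np.random.rand(OUT * INP).astype(np.float32)

out = compute_with_buffers(
    {1: jacobian},       # @binding(1): jacobian (read-only storage)
    {2: OUT * OUT * 4},  # @binding(2): U, requested output size in bytes
    shader_source,
    n=1,                 # a single workgroup
)
U = np.frombuffer(out[2], dtype=np.float32).reshape(OUT, OUT)

# Uncorrupted output should equal J @ J.T computed on the CPU.
J = jacobian.reshape(OUT, INP)
print(np.allclose(U, J @ J.T, rtol=1e-4))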